1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 decomposition/composition and case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2016 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Usage example:
21 $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
22 /usr/local/share/Unidata/PropList.txt \
23 /usr/local/share/Unidata/DerivedCoreProperties.txt \
24 /usr/local/share/Unidata/ArabicShaping.txt \
25 /usr/local/share/Unidata/Scripts.txt \
26 /usr/local/share/Unidata/Blocks.txt \
27 /usr/local/share/Unidata/PropList-3.0.1.txt \
28 /usr/local/share/Unidata/EastAsianWidth.txt \
29 /usr/local/share/Unidata/LineBreak.txt \
30 /usr/local/share/Unidata/WordBreakProperty.txt \
31 /usr/local/share/Unidata/GraphemeBreakProperty.txt \
32 /usr/local/share/Unidata/CompositionExclusions.txt \
33 /usr/local/share/Unidata/SpecialCasing.txt \
34 /usr/local/share/Unidata/CaseFolding.txt \
 */
/* Number of elements of the array A, computed purely from sizeof.
   A must be a true array; for a pointer the quotient is meaningless.  */
46 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
48 /* ========================================================================= */
50 /* Reading UnicodeData.txt. */
53 /* This structure represents one line in the UnicodeData.txt file.
   The string members hold the text of the corresponding column.  A NULL
   name marks a code point for which no line was stored; the is_category_*
   predicates test for that before looking at any other member.  */
54 struct unicode_attribute
56 const char *name; /* Character name */
57 const char *category; /* General category */
58 const char *combining; /* Canonical combining class */
59 const char *bidi; /* Bidirectional category */
60 const char *decomposition; /* Character decomposition mapping */
61 const char *decdigit; /* Decimal digit value */
62 const char *digit; /* Digit value */
63 const char *numeric; /* Numeric value */
64 bool mirrored; /* mirrored */
65 const char *oldname; /* Old Unicode 1.0 name */
66 const char *comment; /* Comment */
67 unsigned int upper; /* Uppercase mapping */
68 unsigned int lower; /* Lowercase mapping */
69 unsigned int title; /* Titlecase mapping */
72 /* Missing fields are represented with "" for strings, and NONE for characters. */
74 #define NONE (~(unsigned int)0)
76 /* The entire contents of the UnicodeData.txt file, indexed by code point.
   The 0x110000 slots cover the code points 0 .. 0x10FFFF.  */
77 struct unicode_attribute unicode_attributes [0x110000];
79 /* Stores in unicode_attributes[i] the values from the given fields.
   FIELD1 .. FIELD14 are the text of columns 1 .. 14 of one UnicodeData.txt
   line; column 0, the code point, has already been parsed into I by the
   caller.  Non-empty strings are copied with strdup (the copies are not
   freed in this function); empty ones share the literal "".  */
81 fill_attribute (unsigned int i,
82 const char *field1, const char *field2,
83 const char *field3, const char *field4,
84 const char *field5, const char *field6,
85 const char *field7, const char *field8,
86 const char *field9, const char *field10,
87 const char *field11, const char *field12,
88 const char *field13, const char *field14)
90 struct unicode_attribute * uni;
/* NOTE(review): the range check that leads to this message is not visible
   in this extract.  */
94 fprintf (stderr, "index too large\n");
97 if (strcmp (field2, "Cs") == 0)
98 /* Surrogates are UTF-16 artifacts, not real characters. Ignore them. */
100 uni = &unicode_attributes[i];
101 /* Copy the strings. */
/* The name is duplicated unconditionally, so a filled slot always has a
   non-NULL name, even when the column is empty.  */
102 uni->name = strdup (field1);
103 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
104 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
105 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
106 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
107 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
108 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
109 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
/* Only the first character is examined: 'Y' means mirrored.  */
110 uni->mirrored = (field9[0] == 'Y');
111 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
112 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
/* Case mappings are hexadecimal code points; an empty column becomes NONE.  */
113 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
114 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
115 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
118 /* Maximum length of a field in the UnicodeData.txt file. */
121 /* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; the delimiter itself is consumed from
   STREAM and is not stored.
   Returns 1 when a field was successfully read, otherwise 0.  */
125 getfield (FILE *stream, char *buffer, int delim)
/* Consume characters until the delimiter or end of file is seen.  */
130 for (; (c = getc (stream)), (c != EOF && c != delim); )
132 /* The original unicode.org UnicodeData.txt file happens to have
   CR/LF line terminators.  Silently convert to LF.  */
137 /* Put c into the buffer. */
/* Refuse to go past the buffer, keeping one byte in reserve (presumably
   for the terminating NUL).  */
138 if (++count >= FIELDLEN - 1)
140 fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
153 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
   file named UNICODEDATA_FILENAME.
   Every slot is first marked empty (name == NULL).  Each line is then split
   into 15 fields: fields 0 .. 13 end at ';' and field 14 at the newline.
   Field 0 is the code point in hexadecimal; fields 1 .. 14 are handed to
   fill_attribute.  A name ending in ", First>" opens a range, and the line
   that follows must be the matching "<..., Last>" entry.  */
156 fill_attributes (const char *unicodedata_filename)
160 char field0[FIELDLEN];
161 char field1[FIELDLEN];
162 char field2[FIELDLEN];
163 char field3[FIELDLEN];
164 char field4[FIELDLEN];
165 char field5[FIELDLEN];
166 char field6[FIELDLEN];
167 char field7[FIELDLEN];
168 char field8[FIELDLEN];
169 char field9[FIELDLEN];
170 char field10[FIELDLEN];
171 char field11[FIELDLEN];
172 char field12[FIELDLEN];
173 char field13[FIELDLEN];
174 char field14[FIELDLEN];
/* Mark every code point as not (yet) present in the file.  */
177 for (i = 0; i < 0x110000; i++)
178 unicode_attributes[i].name = NULL;
180 stream = fopen (unicodedata_filename, "r");
183 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
/* getfield returns 1 or 0, so n ends up as the number of fields that were
   read from this line.  */
192 n = getfield (stream, field0, ';');
193 n += getfield (stream, field1, ';');
194 n += getfield (stream, field2, ';');
195 n += getfield (stream, field3, ';');
196 n += getfield (stream, field4, ';');
197 n += getfield (stream, field5, ';');
198 n += getfield (stream, field6, ';');
199 n += getfield (stream, field7, ';');
200 n += getfield (stream, field8, ';');
201 n += getfield (stream, field9, ';');
202 n += getfield (stream, field10, ';');
203 n += getfield (stream, field11, ';');
204 n += getfield (stream, field12, ';');
205 n += getfield (stream, field13, ';');
206 n += getfield (stream, field14, '\n');
211 fprintf (stderr, "short line in '%s':%d\n",
212 unicodedata_filename, lineno);
/* Field 0 is the code point, written in hexadecimal.  */
215 i = strtoul (field0, NULL, 16);
/* A name ending in ", First>" announces a range of code points.  */
217 && strlen (field1) >= 9
218 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
220 /* Deal with a range. */
/* Read the following line, which must describe the end of the range.  */
222 n = getfield (stream, field0, ';');
223 n += getfield (stream, field1, ';');
224 n += getfield (stream, field2, ';');
225 n += getfield (stream, field3, ';');
226 n += getfield (stream, field4, ';');
227 n += getfield (stream, field5, ';');
228 n += getfield (stream, field6, ';');
229 n += getfield (stream, field7, ';');
230 n += getfield (stream, field8, ';');
231 n += getfield (stream, field9, ';');
232 n += getfield (stream, field10, ';');
233 n += getfield (stream, field11, ';');
234 n += getfield (stream, field12, ';');
235 n += getfield (stream, field13, ';');
236 n += getfield (stream, field14, '\n');
239 fprintf (stderr, "missing end range in '%s':%d\n",
240 unicodedata_filename, lineno);
243 if (!(field1[0] == '<'
244 && strlen (field1) >= 8
245 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
247 fprintf (stderr, "missing end range in '%s':%d\n",
248 unicodedata_filename, lineno);
/* Cut off the ", Last>" suffix; field1+1 below also skips the leading '<',
   leaving the bare name of the range.  */
251 field1[strlen (field1) - 7] = '\0';
/* j is the last code point of the range.  */
252 j = strtoul (field0, NULL, 16);
/* NOTE(review): presumably executed for every code point from i to j; the
   loop header is not visible in this extract.  */
254 fill_attribute (i, field1+1, field2, field3, field4, field5,
255 field6, field7, field8, field9, field10,
256 field11, field12, field13, field14);
260 /* Single character line */
261 fill_attribute (i, field1, field2, field3, field4, field5,
262 field6, field7, field8, field9, field10,
263 field11, field12, field13, field14);
/* Report both read errors and a failing fclose.  */
267 if (ferror (stream) || fclose (stream))
269 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
274 /* ========================================================================= */
276 /* General category. */
277 /* See Unicode 3.0 book, section 4.5. */
/* Tests whether CH has a UnicodeData entry whose general category begins
   with 'L' (any Letter subcategory).  */
281 is_category_L (unsigned int ch)
283 return (unicode_attributes[ch].name != NULL
284 && unicode_attributes[ch].category[0] == 'L');
/* Tests whether CH has a UnicodeData entry whose general category is one of
   "Lu", "Ll" or "Lt", i.e. the union that is named LC.  */
288 is_category_LC (unsigned int ch)
290 /* See PropertyValueAliases.txt. */
291 return (unicode_attributes[ch].name != NULL
292 && unicode_attributes[ch].category[0] == 'L'
293 && (unicode_attributes[ch].category[1] == 'u'
294 || unicode_attributes[ch].category[1] == 'l'
295 || unicode_attributes[ch].category[1] == 't'));
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Lu" (Uppercase Letter).  */
299 is_category_Lu (unsigned int ch)
301 return (unicode_attributes[ch].name != NULL
302 && unicode_attributes[ch].category[0] == 'L'
303 && unicode_attributes[ch].category[1] == 'u');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Ll" (Lowercase Letter).  */
307 is_category_Ll (unsigned int ch)
309 return (unicode_attributes[ch].name != NULL
310 && unicode_attributes[ch].category[0] == 'L'
311 && unicode_attributes[ch].category[1] == 'l');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Lt" (Titlecase Letter).  */
315 is_category_Lt (unsigned int ch)
317 return (unicode_attributes[ch].name != NULL
318 && unicode_attributes[ch].category[0] == 'L'
319 && unicode_attributes[ch].category[1] == 't');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Lm" (Modifier Letter).  */
323 is_category_Lm (unsigned int ch)
325 return (unicode_attributes[ch].name != NULL
326 && unicode_attributes[ch].category[0] == 'L'
327 && unicode_attributes[ch].category[1] == 'm');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Lo" (Other Letter).  */
331 is_category_Lo (unsigned int ch)
333 return (unicode_attributes[ch].name != NULL
334 && unicode_attributes[ch].category[0] == 'L'
335 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with 'M' (any Mark subcategory).  */
339 is_category_M (unsigned int ch)
341 return (unicode_attributes[ch].name != NULL
342 && unicode_attributes[ch].category[0] == 'M');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Mn" (Nonspacing Mark).  */
346 is_category_Mn (unsigned int ch)
348 return (unicode_attributes[ch].name != NULL
349 && unicode_attributes[ch].category[0] == 'M'
350 && unicode_attributes[ch].category[1] == 'n');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Mc" (Spacing Mark).  */
354 is_category_Mc (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'M'
358 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Me" (Enclosing Mark).  */
362 is_category_Me (unsigned int ch)
364 return (unicode_attributes[ch].name != NULL
365 && unicode_attributes[ch].category[0] == 'M'
366 && unicode_attributes[ch].category[1] == 'e');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with 'N' (any Number subcategory).  */
370 is_category_N (unsigned int ch)
372 return (unicode_attributes[ch].name != NULL
373 && unicode_attributes[ch].category[0] == 'N');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Nd" (Decimal Number).  */
377 is_category_Nd (unsigned int ch)
379 return (unicode_attributes[ch].name != NULL
380 && unicode_attributes[ch].category[0] == 'N'
381 && unicode_attributes[ch].category[1] == 'd');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Nl" (Letter Number).  */
385 is_category_Nl (unsigned int ch)
387 return (unicode_attributes[ch].name != NULL
388 && unicode_attributes[ch].category[0] == 'N'
389 && unicode_attributes[ch].category[1] == 'l');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "No" (Other Number).  */
393 is_category_No (unsigned int ch)
395 return (unicode_attributes[ch].name != NULL
396 && unicode_attributes[ch].category[0] == 'N'
397 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with 'P' (any Punctuation subcategory).  */
401 is_category_P (unsigned int ch)
403 return (unicode_attributes[ch].name != NULL
404 && unicode_attributes[ch].category[0] == 'P');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Pc" (Connector Punctuation).  */
408 is_category_Pc (unsigned int ch)
410 return (unicode_attributes[ch].name != NULL
411 && unicode_attributes[ch].category[0] == 'P'
412 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Pd" (Dash Punctuation).  */
416 is_category_Pd (unsigned int ch)
418 return (unicode_attributes[ch].name != NULL
419 && unicode_attributes[ch].category[0] == 'P'
420 && unicode_attributes[ch].category[1] == 'd');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Ps" (Open Punctuation).  */
424 is_category_Ps (unsigned int ch)
426 return (unicode_attributes[ch].name != NULL
427 && unicode_attributes[ch].category[0] == 'P'
428 && unicode_attributes[ch].category[1] == 's');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Pe" (Close Punctuation).  */
432 is_category_Pe (unsigned int ch)
434 return (unicode_attributes[ch].name != NULL
435 && unicode_attributes[ch].category[0] == 'P'
436 && unicode_attributes[ch].category[1] == 'e');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Pi" (Initial Punctuation).  */
440 is_category_Pi (unsigned int ch)
442 return (unicode_attributes[ch].name != NULL
443 && unicode_attributes[ch].category[0] == 'P'
444 && unicode_attributes[ch].category[1] == 'i');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Pf" (Final Punctuation).  */
448 is_category_Pf (unsigned int ch)
450 return (unicode_attributes[ch].name != NULL
451 && unicode_attributes[ch].category[0] == 'P'
452 && unicode_attributes[ch].category[1] == 'f');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Po" (Other Punctuation).  */
456 is_category_Po (unsigned int ch)
458 return (unicode_attributes[ch].name != NULL
459 && unicode_attributes[ch].category[0] == 'P'
460 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with 'S' (any Symbol subcategory).  */
464 is_category_S (unsigned int ch)
466 return (unicode_attributes[ch].name != NULL
467 && unicode_attributes[ch].category[0] == 'S');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Sm" (Math Symbol).  */
471 is_category_Sm (unsigned int ch)
473 return (unicode_attributes[ch].name != NULL
474 && unicode_attributes[ch].category[0] == 'S'
475 && unicode_attributes[ch].category[1] == 'm');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Sc" (Currency Symbol).  */
479 is_category_Sc (unsigned int ch)
481 return (unicode_attributes[ch].name != NULL
482 && unicode_attributes[ch].category[0] == 'S'
483 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Sk" (Modifier Symbol).  */
487 is_category_Sk (unsigned int ch)
489 return (unicode_attributes[ch].name != NULL
490 && unicode_attributes[ch].category[0] == 'S'
491 && unicode_attributes[ch].category[1] == 'k');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "So" (Other Symbol).  */
495 is_category_So (unsigned int ch)
497 return (unicode_attributes[ch].name != NULL
498 && unicode_attributes[ch].category[0] == 'S'
499 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with 'Z' (any Separator subcategory).  */
503 is_category_Z (unsigned int ch)
505 return (unicode_attributes[ch].name != NULL
506 && unicode_attributes[ch].category[0] == 'Z');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Zs" (Space Separator).  */
510 is_category_Zs (unsigned int ch)
512 return (unicode_attributes[ch].name != NULL
513 && unicode_attributes[ch].category[0] == 'Z'
514 && unicode_attributes[ch].category[1] == 's');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Zl" (Line Separator).  */
518 is_category_Zl (unsigned int ch)
520 return (unicode_attributes[ch].name != NULL
521 && unicode_attributes[ch].category[0] == 'Z'
522 && unicode_attributes[ch].category[1] == 'l');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Zp" (Paragraph Separator).  */
526 is_category_Zp (unsigned int ch)
528 return (unicode_attributes[ch].name != NULL
529 && unicode_attributes[ch].category[0] == 'Z'
530 && unicode_attributes[ch].category[1] == 'p');
/* Tests whether CH belongs to the major class 'C'.  Unlike the other major
   classes, a code point without a stored entry (name == NULL) counts as a
   member as well.  */
534 is_category_C (unsigned int ch)
536 return (unicode_attributes[ch].name == NULL
537 || unicode_attributes[ch].category[0] == 'C');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Cc" (Control).  */
541 is_category_Cc (unsigned int ch)
543 return (unicode_attributes[ch].name != NULL
544 && unicode_attributes[ch].category[0] == 'C'
545 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Cf" (Format).  */
549 is_category_Cf (unsigned int ch)
551 return (unicode_attributes[ch].name != NULL
552 && unicode_attributes[ch].category[0] == 'C'
553 && unicode_attributes[ch].category[1] == 'f');
/* Tests whether CH is a surrogate.  This is decided from the code point
   range 0xD800 .. 0xDFFF alone, without consulting unicode_attributes:
   fill_attribute deliberately ignores lines of category "Cs".  */
557 is_category_Cs (unsigned int ch)
559 return (ch >= 0xd800 && ch < 0xe000);
/* Tests whether CH has a UnicodeData entry whose general category begins
   with "Co" (Private Use).  */
563 is_category_Co (unsigned int ch)
565 return (unicode_attributes[ch].name != NULL
566 && unicode_attributes[ch].category[0] == 'C'
567 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH is unassigned: it has no stored entry and lies outside
   the surrogate range 0xD800 .. 0xDFFF.  */
571 is_category_Cn (unsigned int ch)
573 return (unicode_attributes[ch].name == NULL
574 && !(ch >= 0xd800 && ch < 0xe000));
577 /* Output a boolean property in a human readable format.
   Writes to FILENAME the code points in 0 .. 0x10FFFF selected by PREDICATE:
   a "0x%04X..0x%04X" line for a run of consecutive code points and a
   "0x%04X" line for a single one.  */
579 debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
584 stream = fopen (filename, "w");
587 fprintf (stderr, "cannot open '%s' for writing\n", filename);
591 #if 0 /* This yields huge text output. */
592 for (ch = 0; ch < 0x110000; ch++)
595 fprintf (stream, "0x%04X\n", ch);
/* Range-collapsing variant (presumably the alternative branch of the
   conditional above; the directive is not visible in this extract).  */
598 for (ch = 0; ch < 0x110000; ch++)
601 unsigned int first = ch;
/* Extend the run while the next code point also satisfies PREDICATE.  */
604 while (ch + 1 < 0x110000 && predicate (ch + 1))
608 fprintf (stream, "0x%04X..0x%04X\n", first, last);
610 fprintf (stream, "0x%04X\n", ch);
/* Check both the stream's error flag and the result of fclose.  */
614 if (ferror (stream) || fclose (stream))
616 fprintf (stderr, "error writing to '%s'\n", filename);
621 /* Output the unit test for a boolean property.
   Writes to FILENAME a generated C source consisting of a license header,
   an include of "test-predicate-part1.h", one "{ first, last }" entry per
   run of consecutive code points satisfying PREDICATE, a definition of
   PREDICATE(c) as EXPRESSION, and an include of "test-predicate-part2.h".  */
623 output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
629 stream = fopen (filename, "w");
632 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* Fixed header of the generated file.  The copyright year is part of the
   literal text and is not derived from anything at run time.  */
636 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
637 fprintf (stream, "/* Test the Unicode character type functions.\n");
638 fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
639 fprintf (stream, "\n");
640 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
641 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
642 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
643 fprintf (stream, " (at your option) any later version.\n");
644 fprintf (stream, "\n");
645 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
646 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
647 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
648 fprintf (stream, " GNU General Public License for more details.\n");
649 fprintf (stream, "\n");
650 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
651 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
652 fprintf (stream, "\n");
653 fprintf (stream, "#include \"test-predicate-part1.h\"\n");
654 fprintf (stream, "\n");
657 for (ch = 0; ch < 0x110000; ch++)
660 unsigned int first = ch;
/* Extend the run while the next code point also satisfies PREDICATE.  */
663 while (ch + 1 < 0x110000 && predicate (ch + 1))
/* ",\n" separates one entry from the next; the condition guarding it is not
   visible in this extract.  */
667 fprintf (stream, ",\n");
668 fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
672 fprintf (stream, "\n");
674 fprintf (stream, "\n");
/* EXPRESSION is inserted verbatim as the body of the PREDICATE macro.  */
675 fprintf (stream, "#define PREDICATE(c) %s\n", expression);
676 fprintf (stream, "#include \"test-predicate-part2.h\"\n");
/* Check both the stream's error flag and the result of fclose.  */
678 if (ferror (stream) || fclose (stream))
680 fprintf (stderr, "error writing to '%s'\n", filename);
685 /* Construction of sparse 3-level tables.
   TABLE selects the name of this instantiation: the code that follows uses
   struct predicate_table and predicate_table_init, predicate_table_add and
   predicate_table_finalize (presumably all supplied by the included header).
   xmalloc and xrealloc are mapped to plain malloc and realloc.  */
686 #define TABLE predicate_table
687 #define xmalloc malloc
688 #define xrealloc realloc
689 #include "3levelbit.h"
691 /* Output a boolean property in a three-level bitmap.
   Writes to FILENAME a C initializer for a constant struct named NAME with
   the members header, level1, level2 and level3, taken from a
   predicate_table built with predicate_table_add.  COMMENT is placed in a
   leading comment of the generated file; VERSION is presumably the argument
   of the "for Unicode %s" line (the argument itself is not visible in this
   extract).  */
693 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
697 struct predicate_table t;
698 unsigned int level1_offset, level2_offset, level3_offset;
700 stream = fopen (filename, "w");
703 fprintf (stderr, "cannot open '%s' for writing\n", filename);
707 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
708 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
709 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
714 predicate_table_init (&t);
/* NOTE(review): presumably only code points satisfying PREDICATE are added;
   the guarding condition is not visible in this extract.  */
716 for (ch = 0; ch < 0x110000; ch++)
718 predicate_table_add (&t, ch);
720 predicate_table_finalize (&t);
722 /* Offsets in t.result, in memory of this process. */
/* t.result starts with 5 uint32_t header words, followed by level1
   (level1_size words) and level2 (level2_size << q words); level3 comes
   after that.  The three expressions below are presumably assigned to
   level1_offset, level2_offset and level3_offset, in this order.  */
724 5 * sizeof (uint32_t);
726 5 * sizeof (uint32_t)
727 + t.level1_size * sizeof (uint32_t);
729 5 * sizeof (uint32_t)
730 + t.level1_size * sizeof (uint32_t)
731 + (t.level2_size << t.q) * sizeof (uint32_t);
/* All five header words are emitted as header_<i> macros.
   NOTE(review): the values are uint32_t but are printed with %d.  */
733 for (i = 0; i < 5; i++)
735 fprintf (stream, "#define header_%d %d\n", i,
736 ((uint32_t *) t.result)[i]);
738 fprintf (stream, "static const\n");
739 fprintf (stream, "struct\n");
740 fprintf (stream, " {\n");
741 fprintf (stream, " int header[1];\n");
742 fprintf (stream, " int level1[%zu];\n", t.level1_size);
743 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
744 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
745 fprintf (stream, " }\n");
746 fprintf (stream, "%s =\n", name);
747 fprintf (stream, "{\n");
/* Only header word 1 is stored in the generated struct itself.  */
748 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
749 fprintf (stream, " {");
750 if (t.level1_size > 1)
751 fprintf (stream, "\n ");
752 for (i = 0; i < t.level1_size; i++)
/* (i % 1) == 0 always holds, so every entry after the first starts a new
   line.  */
755 if (i > 0 && (i % 1) == 0)
756 fprintf (stream, "\n ");
757 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* An entry is printed either as -1 or as an index expression in which the
   byte offset has been rescaled to a count of level2 elements; the test
   choosing between the two is not visible in this extract.  */
759 fprintf (stream, " %5d", -1);
761 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
762 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
763 if (i+1 < t.level1_size)
764 fprintf (stream, ",");
766 if (t.level1_size > 1)
767 fprintf (stream, "\n ");
768 fprintf (stream, " },\n");
769 fprintf (stream, " {");
770 if (t.level2_size << t.q > 1)
771 fprintf (stream, "\n ");
772 for (i = 0; i < t.level2_size << t.q; i++)
775 if (i > 0 && (i % 1) == 0)
776 fprintf (stream, "\n ");
777 offset = ((uint32_t *) (t.result + level2_offset))[i];
/* Same scheme as for level1, with the byte offset rescaled to a count of
   level3 words.  */
779 fprintf (stream, " %5d", -1);
781 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
782 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
783 if (i+1 < t.level2_size << t.q)
784 fprintf (stream, ",");
786 if (t.level2_size << t.q > 1)
787 fprintf (stream, "\n ");
788 fprintf (stream, " },\n");
789 fprintf (stream, " {");
790 if (t.level3_size << t.p > 4)
791 fprintf (stream, "\n ");
/* The level3 words are printed in hexadecimal, four per line.  */
792 for (i = 0; i < t.level3_size << t.p; i++)
794 if (i > 0 && (i % 4) == 0)
795 fprintf (stream, "\n ");
796 fprintf (stream, " 0x%08X",
797 ((uint32_t *) (t.result + level3_offset))[i]);
798 if (i+1 < t.level3_size << t.p)
799 fprintf (stream, ",");
801 if (t.level3_size << t.p > 4)
802 fprintf (stream, "\n ");
803 fprintf (stream, " }\n");
804 fprintf (stream, "};\n");
/* Check both the stream's error flag and the result of fclose.  */
806 if (ferror (stream) || fclose (stream))
808 fprintf (stderr, "error writing to '%s'\n", filename);
813 /* Output all categories.  VERSION is handed on to output_predicate.  */
815 output_categories (const char *version)
/* CATEGORY(C) produces the three files for general category C, all driven
   by the predicate is_category_C: a readable listing
   (unictype/categ_C.txt), a unit test (../tests/unictype/test-categ_C.c)
   and the three-level bitmap (unictype/categ_C.h, table name u_categ_C).  */
817 #define CATEGORY(C) \
818 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
819 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
820 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* General category bit masks.  Every two-letter category occupies one bit;
   every one-letter name, and LC, is the bitwise OR of the categories it
   comprises, e.g. L = Lu | Ll | Lt | Lm | Lo and LC = Lu | Ll | Lt.  */
864 UC_CATEGORY_MASK_L = 0x0000001f,
865 UC_CATEGORY_MASK_LC = 0x00000007,
866 UC_CATEGORY_MASK_Lu = 0x00000001,
867 UC_CATEGORY_MASK_Ll = 0x00000002,
868 UC_CATEGORY_MASK_Lt = 0x00000004,
869 UC_CATEGORY_MASK_Lm = 0x00000008,
870 UC_CATEGORY_MASK_Lo = 0x00000010,
871 UC_CATEGORY_MASK_M = 0x000000e0,
872 UC_CATEGORY_MASK_Mn = 0x00000020,
873 UC_CATEGORY_MASK_Mc = 0x00000040,
874 UC_CATEGORY_MASK_Me = 0x00000080,
875 UC_CATEGORY_MASK_N = 0x00000700,
876 UC_CATEGORY_MASK_Nd = 0x00000100,
877 UC_CATEGORY_MASK_Nl = 0x00000200,
878 UC_CATEGORY_MASK_No = 0x00000400,
879 UC_CATEGORY_MASK_P = 0x0003f800,
880 UC_CATEGORY_MASK_Pc = 0x00000800,
881 UC_CATEGORY_MASK_Pd = 0x00001000,
882 UC_CATEGORY_MASK_Ps = 0x00002000,
883 UC_CATEGORY_MASK_Pe = 0x00004000,
884 UC_CATEGORY_MASK_Pi = 0x00008000,
885 UC_CATEGORY_MASK_Pf = 0x00010000,
886 UC_CATEGORY_MASK_Po = 0x00020000,
887 UC_CATEGORY_MASK_S = 0x003c0000,
888 UC_CATEGORY_MASK_Sm = 0x00040000,
889 UC_CATEGORY_MASK_Sc = 0x00080000,
890 UC_CATEGORY_MASK_Sk = 0x00100000,
891 UC_CATEGORY_MASK_So = 0x00200000,
892 UC_CATEGORY_MASK_Z = 0x01c00000,
893 UC_CATEGORY_MASK_Zs = 0x00400000,
894 UC_CATEGORY_MASK_Zl = 0x00800000,
895 UC_CATEGORY_MASK_Zp = 0x01000000,
896 UC_CATEGORY_MASK_C = 0x3e000000,
897 UC_CATEGORY_MASK_Cc = 0x02000000,
898 UC_CATEGORY_MASK_Cf = 0x04000000,
899 UC_CATEGORY_MASK_Cs = 0x08000000,
900 UC_CATEGORY_MASK_Co = 0x10000000,
901 UC_CATEGORY_MASK_Cn = 0x20000000
/* Returns the UC_CATEGORY_MASK_* value that corresponds to CATEGORY_NAME.
   Only names of one or two characters are looked up: the first character
   selects the major class and the second, if present, the subcategory
   ("LC" is accepted as well).  What happens for an unknown name is not
   visible in this extract.  */
905 general_category_byname (const char *category_name)
/* Accept only names that are exactly one or two characters long.  */
907 if (category_name[0] != '\0'
908 && (category_name[1] == '\0' || category_name[2] == '\0'))
909 switch (category_name[0])
912 switch (category_name[1])
914 case '\0': return UC_CATEGORY_MASK_L;
915 case 'C': return UC_CATEGORY_MASK_LC;
916 case 'u': return UC_CATEGORY_MASK_Lu;
917 case 'l': return UC_CATEGORY_MASK_Ll;
918 case 't': return UC_CATEGORY_MASK_Lt;
919 case 'm': return UC_CATEGORY_MASK_Lm;
920 case 'o': return UC_CATEGORY_MASK_Lo;
924 switch (category_name[1])
926 case '\0': return UC_CATEGORY_MASK_M;
927 case 'n': return UC_CATEGORY_MASK_Mn;
928 case 'c': return UC_CATEGORY_MASK_Mc;
929 case 'e': return UC_CATEGORY_MASK_Me;
933 switch (category_name[1])
935 case '\0': return UC_CATEGORY_MASK_N;
936 case 'd': return UC_CATEGORY_MASK_Nd;
937 case 'l': return UC_CATEGORY_MASK_Nl;
938 case 'o': return UC_CATEGORY_MASK_No;
942 switch (category_name[1])
944 case '\0': return UC_CATEGORY_MASK_P;
945 case 'c': return UC_CATEGORY_MASK_Pc;
946 case 'd': return UC_CATEGORY_MASK_Pd;
947 case 's': return UC_CATEGORY_MASK_Ps;
948 case 'e': return UC_CATEGORY_MASK_Pe;
949 case 'i': return UC_CATEGORY_MASK_Pi;
950 case 'f': return UC_CATEGORY_MASK_Pf;
951 case 'o': return UC_CATEGORY_MASK_Po;
955 switch (category_name[1])
957 case '\0': return UC_CATEGORY_MASK_S;
958 case 'm': return UC_CATEGORY_MASK_Sm;
959 case 'c': return UC_CATEGORY_MASK_Sc;
960 case 'k': return UC_CATEGORY_MASK_Sk;
961 case 'o': return UC_CATEGORY_MASK_So;
965 switch (category_name[1])
967 case '\0': return UC_CATEGORY_MASK_Z;
968 case 's': return UC_CATEGORY_MASK_Zs;
969 case 'l': return UC_CATEGORY_MASK_Zl;
970 case 'p': return UC_CATEGORY_MASK_Zp;
974 switch (category_name[1])
976 case '\0': return UC_CATEGORY_MASK_C;
977 case 'c': return UC_CATEGORY_MASK_Cc;
978 case 'f': return UC_CATEGORY_MASK_Cf;
979 case 's': return UC_CATEGORY_MASK_Cs;
980 case 'o': return UC_CATEGORY_MASK_Co;
981 case 'n': return UC_CATEGORY_MASK_Cn;
985 /* Invalid category name. */
989 /* Construction of sparse 3-level tables.
   This instantiation is named category_table and stores uint8_t elements.
   DEFAULT is 29, the bit index of UC_CATEGORY_MASK_Cn (0x20000000);
   presumably it is the value reported for code points that were never
   added.  xmalloc and xrealloc are mapped to plain malloc and realloc.  */
990 #define TABLE category_table
991 #define ELEMENT uint8_t
992 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
993 #define xmalloc malloc
994 #define xrealloc realloc
997 /* Output the per-character category table.
   Writes to FILENAME a C initializer for the constant struct u_category.
   For every code point the single-bit category mask is reduced to its bit
   index (0 .. 31) and stored in a category_table; level1 and level2 are
   emitted as element indices and level3 is emitted packed at 5 bits per
   entry in 16-bit units.  VERSION is presumably the argument of the
   "for Unicode %s" line (the argument itself is not visible in this
   extract).  */
999 output_category (const char *filename, const char *version)
1003 struct category_table t;
1004 unsigned int level1_offset, level2_offset, level3_offset;
1005 uint16_t *level3_packed;
1007 stream = fopen (filename, "w");
1010 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1014 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1015 fprintf (stream, "/* Categories of Unicode characters. */\n");
1016 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1021 category_table_init (&t);
1023 for (ch = 0; ch < 0x110000; ch++)
1026 unsigned int log2_value;
/* Surrogates are recognized by range; code points with an entry are looked
   up by their category string.  The remaining case is not visible in this
   extract.  */
1028 if (is_category_Cs (ch))
1029 value = UC_CATEGORY_MASK_Cs;
1030 else if (unicode_attributes[ch].name != NULL)
1031 value = general_category_byname (unicode_attributes[ch].category);
1035 /* Now value should contain exactly one bit. */
1036 assert (value != 0 && (value & (value - 1)) == 0);
/* Count how often value can be halved: this is the index of its only bit.  */
1038 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
/* The index must fit into the 5 bits used per entry when packing level3.  */
1040 assert (log2_value <= 0x1f);
1042 category_table_add (&t, ch, log2_value);
1045 category_table_finalize (&t);
1047 /* Offsets in t.result, in memory of this process. */
/* t.result starts with 5 uint32_t header words, followed by level1
   (level1_size words) and level2 (level2_size << q words); level3 comes
   after that.  The three expressions below are presumably assigned to
   level1_offset, level2_offset and level3_offset, in this order.  */
1049 5 * sizeof (uint32_t);
1051 5 * sizeof (uint32_t)
1052 + t.level1_size * sizeof (uint32_t);
1054 5 * sizeof (uint32_t)
1055 + t.level1_size * sizeof (uint32_t)
1056 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the header words are uint32_t but are printed with %d.  */
1058 for (i = 0; i < 5; i++)
1059 fprintf (stream, "#define category_header_%d %d\n", i,
1060 ((uint32_t *) t.result)[i]);
1061 fprintf (stream, "static const\n");
1062 fprintf (stream, "struct\n");
1063 fprintf (stream, " {\n");
1064 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1065 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 is declared in 16-bit units: (1 << p) * 5 / 16 units per block,
   plus one extra unit at the end.  */
1066 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1067 (1 << t.p) * 5 / 16);
1068 fprintf (stream, " }\n");
1069 fprintf (stream, "u_category =\n");
1070 fprintf (stream, "{\n");
1071 fprintf (stream, " {");
1072 if (t.level1_size > 8)
1073 fprintf (stream, "\n ");
1074 for (i = 0; i < t.level1_size; i++)
1077 if (i > 0 && (i % 8) == 0)
1078 fprintf (stream, "\n ");
1079 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* An entry is printed either as -1 or as the byte offset rescaled to a
   count of level2 elements; the test choosing between the two is not
   visible in this extract.  */
1081 fprintf (stream, " %5d", -1);
1083 fprintf (stream, " %5zu",
1084 (offset - level2_offset) / sizeof (uint32_t));
1085 if (i+1 < t.level1_size)
1086 fprintf (stream, ",");
1088 if (t.level1_size > 8)
1089 fprintf (stream, "\n ");
1090 fprintf (stream, " },\n");
1091 fprintf (stream, " {");
1092 if (t.level2_size << t.q > 8)
1093 fprintf (stream, "\n ");
1094 for (i = 0; i < t.level2_size << t.q; i++)
1097 if (i > 0 && (i % 8) == 0)
1098 fprintf (stream, "\n ");
1099 offset = ((uint32_t *) (t.result + level2_offset))[i];
/* Same scheme, counting unpacked level3 elements of one byte each.  */
1101 fprintf (stream, " %5d", -1);
1103 fprintf (stream, " %5zu",
1104 (offset - level3_offset) / sizeof (uint8_t));
1105 if (i+1 < t.level2_size << t.q)
1106 fprintf (stream, ",");
1108 if (t.level2_size << t.q > 8)
1109 fprintf (stream, "\n ");
1110 fprintf (stream, " },\n");
1111 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
   not 32-bit units, in order to make the lookup function easier. */
/* calloc zero-fills the buffer, which the OR-based packing below relies on.
   One unit more than strictly needed is allocated because every store
   touches both unit j and unit j+1.
   NOTE(review): no check of the calloc result is visible in this extract.  */
1115 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1116 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i starts at bit 5*i of the packed stream: j is the 16-bit unit that
   contains this bit and k the bit position inside that unit.  */
1118 unsigned int j = (i * 5) / 16;
1119 unsigned int k = (i * 5) % 16;
1120 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
/* Treat units j and j+1 as one 32-bit quantity, so that an entry which
   straddles a unit boundary is stored correctly.  */
1121 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1122 level3_packed[j] = value & 0xffff;
1123 level3_packed[j+1] = value >> 16;
1125 fprintf (stream, " {");
1126 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1127 fprintf (stream, "\n ");
/* The packed units are printed in hexadecimal, eight per line.  */
1128 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1130 if (i > 0 && (i % 8) == 0)
1131 fprintf (stream, "\n ");
1132 fprintf (stream, " 0x%04x", level3_packed[i]);
1133 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1134 fprintf (stream, ",");
1136 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1137 fprintf (stream, "\n ");
1138 fprintf (stream, " }\n");
1139 free (level3_packed);
1140 fprintf (stream, "};\n");
/* Check both the stream's error flag and the result of fclose.  */
1142 if (ferror (stream) || fclose (stream))
1144 fprintf (stderr, "error writing to '%s'\n", filename);
1149 /* ========================================================================= */
1151 /* Canonical combining class. */
1152 /* See Unicode 3.0 book, section 4.2. */
1155 /* Construction of sparse 3-level tables.
   This instantiation is named combclass_table and stores uint8_t elements.
   xmalloc and xrealloc are mapped to plain malloc and realloc.  */
1156 #define TABLE combclass_table
1157 #define ELEMENT uint8_t
1159 #define xmalloc malloc
1160 #define xrealloc realloc
1163 /* Output the per-character combining class table.
   Writes to FILENAME a C initializer for the constant struct u_combclass.
   For every code point that has an entry, the combining column is converted
   with atoi, must lie in 0 .. 255, and is stored in a combclass_table whose
   three levels are then printed.  VERSION is presumably the argument of the
   "for Unicode %s" line (the argument itself is not visible in this
   extract).  */
1165 output_combclass (const char *filename, const char *version)
1169 struct combclass_table t;
1170 unsigned int level1_offset, level2_offset, level3_offset;
1172 stream = fopen (filename, "w");
1175 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1179 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1180 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1181 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1186 combclass_table_init (&t);
/* Code points without an entry are never added to the table.  */
1188 for (ch = 0; ch < 0x110000; ch++)
1189 if (unicode_attributes[ch].name != NULL)
/* atoi yields 0 for an empty or non-numeric column.  */
1191 int value = atoi (unicode_attributes[ch].combining);
/* The value must fit into the uint8_t table element.  */
1192 assert (value >= 0 && value <= 255);
1193 combclass_table_add (&t, ch, value);
1196 combclass_table_finalize (&t);
1198 /* Offsets in t.result, in memory of this process. */
/* t.result starts with 5 uint32_t header words, followed by level1
   (level1_size words) and level2 (level2_size << q words); level3 comes
   after that.  The three expressions below are presumably assigned to
   level1_offset, level2_offset and level3_offset, in this order.  */
1200 5 * sizeof (uint32_t);
1202 5 * sizeof (uint32_t)
1203 + t.level1_size * sizeof (uint32_t);
1205 5 * sizeof (uint32_t)
1206 + t.level1_size * sizeof (uint32_t)
1207 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the header words are uint32_t but are printed with %d.  */
1209 for (i = 0; i < 5; i++)
1210 fprintf (stream, "#define combclass_header_%d %d\n", i,
1211 ((uint32_t *) t.result)[i]);
1212 fprintf (stream, "static const\n");
1213 fprintf (stream, "struct\n");
1214 fprintf (stream, " {\n");
1215 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1216 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1217 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1218 fprintf (stream, " }\n");
1219 fprintf (stream, "u_combclass =\n");
1220 fprintf (stream, "{\n");
1221 fprintf (stream, " {");
1222 if (t.level1_size > 8)
1223 fprintf (stream, "\n ");
1224 for (i = 0; i < t.level1_size; i++)
1227 if (i > 0 && (i % 8) == 0)
1228 fprintf (stream, "\n ");
1229 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* An entry is printed either as -1 or as the byte offset rescaled to a
   count of level2 elements; the test choosing between the two is not
   visible in this extract.  */
1231 fprintf (stream, " %5d", -1);
1233 fprintf (stream, " %5zu",
1234 (offset - level2_offset) / sizeof (uint32_t));
1235 if (i+1 < t.level1_size)
1236 fprintf (stream, ",");
1238 if (t.level1_size > 8)
1239 fprintf (stream, "\n ");
1240 fprintf (stream, " },\n");
1241 fprintf (stream, " {");
1242 if (t.level2_size << t.q > 8)
1243 fprintf (stream, "\n ");
1244 for (i = 0; i < t.level2_size << t.q; i++)
1247 if (i > 0 && (i % 8) == 0)
1248 fprintf (stream, "\n ");
1249 offset = ((uint32_t *) (t.result + level2_offset))[i];
/* Same scheme, counting level3 elements of one byte each.  */
1251 fprintf (stream, " %5d", -1);
1253 fprintf (stream, " %5zu",
1254 (offset - level3_offset) / sizeof (uint8_t));
1255 if (i+1 < t.level2_size << t.q)
1256 fprintf (stream, ",");
1258 if (t.level2_size << t.q > 8)
1259 fprintf (stream, "\n ");
1260 fprintf (stream, " },\n");
1261 fprintf (stream, " {");
1262 if (t.level3_size << t.p > 8)
1263 fprintf (stream, "\n ");
/* The level3 bytes are printed in decimal, eight per line.  */
1264 for (i = 0; i < t.level3_size << t.p; i++)
1266 if (i > 0 && (i % 8) == 0)
1267 fprintf (stream, "\n ");
1268 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1269 if (i+1 < t.level3_size << t.p)
1270 fprintf (stream, ",");
1272 if (t.level3_size << t.p > 8)
1273 fprintf (stream, "\n ");
1274 fprintf (stream, " }\n");
1275 fprintf (stream, "};\n");
/* Check both the stream's error flag and the result of fclose.  */
1277 if (ferror (stream) || fclose (stream))
1279 fprintf (stderr, "error writing to '%s'\n", filename);
1284 /* ========================================================================= */
1286 /* Bidirectional category. */
1287 /* See Unicode 3.0 book, section 4.3,
/* Bidi categories.  The enumerators are numbered consecutively from 0 in
   declaration order.  */
enum
{
  UC_BIDI_L,   /* Left-to-Right */
  UC_BIDI_LRE, /* Left-to-Right Embedding */
  UC_BIDI_LRO, /* Left-to-Right Override */
  UC_BIDI_R,   /* Right-to-Left */
  UC_BIDI_AL,  /* Right-to-Left Arabic */
  UC_BIDI_RLE, /* Right-to-Left Embedding */
  UC_BIDI_RLO, /* Right-to-Left Override */
  UC_BIDI_PDF, /* Pop Directional Format */
  UC_BIDI_EN,  /* European Number */
  UC_BIDI_ES,  /* European Number Separator */
  UC_BIDI_ET,  /* European Number Terminator */
  UC_BIDI_AN,  /* Arabic Number */
  UC_BIDI_CS,  /* Common Number Separator */
  UC_BIDI_NSM, /* Non-Spacing Mark */
  UC_BIDI_BN,  /* Boundary Neutral */
  UC_BIDI_B,   /* Paragraph Separator */
  UC_BIDI_S,   /* Segment Separator */
  UC_BIDI_WS,  /* Whitespace */
  UC_BIDI_ON,  /* Other Neutral */
  UC_BIDI_LRI, /* Left-to-Right Isolate */
  UC_BIDI_RLI, /* Right-to-Left Isolate */
  UC_BIDI_FSI, /* First Strong Isolate */
  UC_BIDI_PDI  /* Pop Directional Isolate */
};

/* Maps a bidi category abbreviation (for example "L", "AL" or "NSM") to the
   corresponding UC_BIDI_* value.
   CATEGORY_NAME must be a NUL-terminated string; the comparison is exact and
   case-sensitive, so neither a prefix nor a longer string matches.
   Returns the UC_BIDI_* value, or -1 if the name is not one of the 23 known
   abbreviations.
   NOTE(review): the statement that handled an invalid name was not visible
   in the damaged original; returning -1 is an assumption to be confirmed.  */
static int
bidi_category_byname (const char *category_name)
{
  static const struct
    {
      const char *name;
      int category;
    }
  known_categories[] =
    {
      { "L",   UC_BIDI_L },
      { "LRE", UC_BIDI_LRE },
      { "LRO", UC_BIDI_LRO },
      { "R",   UC_BIDI_R },
      { "AL",  UC_BIDI_AL },
      { "RLE", UC_BIDI_RLE },
      { "RLO", UC_BIDI_RLO },
      { "PDF", UC_BIDI_PDF },
      { "EN",  UC_BIDI_EN },
      { "ES",  UC_BIDI_ES },
      { "ET",  UC_BIDI_ET },
      { "AN",  UC_BIDI_AN },
      { "CS",  UC_BIDI_CS },
      { "NSM", UC_BIDI_NSM },
      { "BN",  UC_BIDI_BN },
      { "B",   UC_BIDI_B },
      { "S",   UC_BIDI_S },
      { "WS",  UC_BIDI_WS },
      { "ON",  UC_BIDI_ON },
      { "LRI", UC_BIDI_LRI },
      { "RLI", UC_BIDI_RLI },
      { "FSI", UC_BIDI_FSI },
      { "PDI", UC_BIDI_PDI }
    };
  size_t i;

  for (i = 0; i < sizeof (known_categories) / sizeof (known_categories[0]); i++)
    if (strcmp (category_name, known_categories[i].name) == 0)
      return known_categories[i].category;

  /* Invalid bidi category name.  */
  return -1;
}
/* Returns the bidi category of the code point CH.
   For an assigned character (non-NULL name) the category is looked up from
   the name stored in its 'bidi' attribute.  Otherwise a default is chosen
   from the three groups of code point ranges tested below.  The value
   returned for each group, and for code points in none of them, is not
   visible in this excerpt.  */
1493 get_bidi_category (unsigned int ch)
1495 if (unicode_attributes[ch].name != NULL)
1496 return bidi_category_byname (unicode_attributes[ch].bidi);
1499 /* The bidi category of unassigned characters depends on the range.
1500 See UTR #9 and DerivedBidiClass.txt. */
1501 if ((ch >= 0x0590 && ch <= 0x05FF)
1502 || (ch >= 0x07FB && ch <= 0x08FF)
1503 || (ch >= 0xFB37 && ch <= 0xFB45)
1504 || (ch >= 0x10800 && ch <= 0x10FFF))
1506 else if ((ch >= 0x0600 && ch <= 0x07BF)
1507 || (ch >= 0x2064 && ch <= 0x2069)
1508 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1509 || (ch >= 0xFDFE && ch <= 0xFEFE))
1511 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1512 || (ch >= 0xFFF0 && ch <= 0xFFFF)
/* The two mask tests match every code point whose low 16 bits are FFFE or
   FFFF, i.e. the last two code points of each 64K block.  */
1513 || (ch & 0xFFFF) == 0xFFFE
1514 || (ch & 0xFFFF) == 0xFFFF
1515 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1522 /* Construction of sparse 3-level tables. */
/* TABLE supplies the name behind struct bidi_category_table and the
   bidi_category_table_* functions used by output_bidi_category.  Elements are
   uint8_t, DEFAULT is UC_BIDI_L, and xmalloc/xrealloc are mapped to the plain
   malloc/realloc.  Presumably these macros parameterize a table template
   included right after them; that include is not visible in this excerpt.  */
1523 #define TABLE bidi_category_table
1524 #define ELEMENT uint8_t
1525 #define DEFAULT UC_BIDI_L
1526 #define xmalloc malloc
1527 #define xrealloc realloc
1530 /* Output the per-character bidi category table. */
/* Writes to FILENAME five bidi_category_header_N macros followed by the
   initialized struct u_bidi_category.  Unlike level1 and level2, the level3
   array is bit-packed: each category takes 5 bits inside an array of 16-bit
   units.  VERSION is presumably the string substituted for the %s in the
   banner comment; the argument line is not visible here.  */
1532 output_bidi_category (const char *filename, const char *version)
1536 struct bidi_category_table t;
1537 unsigned int level1_offset, level2_offset, level3_offset;
1538 uint16_t *level3_packed;
1540 stream = fopen (filename, "w");
1543 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1547 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1548 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1549 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1554 bidi_category_table_init (&t);
/* Every code point gets an entry.  The assertion guarantees that the value
   fits into the 5 bits used by the packing further down.  */
1556 for (ch = 0; ch < 0x110000; ch++)
1558 int value = get_bidi_category (ch);
1560 assert (value <= 0x1f);
1562 bidi_category_table_add (&t, ch, value);
1565 bidi_category_table_finalize (&t);
1567 /* Offsets in t.result, in memory of this process. */
/* Byte offsets past the 5 header words, past level1 and past level2.
   Presumably they are assigned to level1_offset, level2_offset and
   level3_offset in that order; the assignment lines are not visible.  */
1569 5 * sizeof (uint32_t);
1571 5 * sizeof (uint32_t)
1572 + t.level1_size * sizeof (uint32_t);
1574 5 * sizeof (uint32_t)
1575 + t.level1_size * sizeof (uint32_t)
1576 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the header words are read as uint32_t but printed with %d.  */
1578 for (i = 0; i < 5; i++)
1579 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1580 ((uint32_t *) t.result)[i]);
1581 fprintf (stream, "static const\n");
1582 fprintf (stream, "struct\n");
1583 fprintf (stream, " {\n");
1584 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1585 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* The declared level3 length is level3_size * ((1 << p) * 5 / 16) + 1 units
   of 16 bits.  */
1586 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1587 (1 << t.p) * 5 / 16);
1588 fprintf (stream, " }\n");
1589 fprintf (stream, "u_bidi_category =\n");
1590 fprintf (stream, "{\n");
/* level1 initializer, 8 values per line: each entry is written either as -1
   or as its position in level2 in uint32_t units.  The selecting condition
   is not visible here.  */
1591 fprintf (stream, " {");
1592 if (t.level1_size > 8)
1593 fprintf (stream, "\n ");
1594 for (i = 0; i < t.level1_size; i++)
1597 if (i > 0 && (i % 8) == 0)
1598 fprintf (stream, "\n ");
1599 offset = ((uint32_t *) (t.result + level1_offset))[i];
1601 fprintf (stream, " %5d", -1);
1603 fprintf (stream, " %5zu",
1604 (offset - level2_offset) / sizeof (uint32_t));
1605 if (i+1 < t.level1_size)
1606 fprintf (stream, ",");
1608 if (t.level1_size > 8)
1609 fprintf (stream, "\n ");
1610 fprintf (stream, " },\n");
/* level2 initializer: -1 or the position in the unpacked level3 data, in
   uint8_t units.  */
1611 fprintf (stream, " {");
1612 if (t.level2_size << t.q > 8)
1613 fprintf (stream, "\n ");
1614 for (i = 0; i < t.level2_size << t.q; i++)
1617 if (i > 0 && (i % 8) == 0)
1618 fprintf (stream, "\n ");
1619 offset = ((uint32_t *) (t.result + level2_offset))[i];
1621 fprintf (stream, " %5d", -1);
1623 fprintf (stream, " %5zu",
1624 (offset - level3_offset) / sizeof (uint8_t));
1625 if (i+1 < t.level2_size << t.q)
1626 fprintf (stream, ",");
1628 if (t.level2_size << t.q > 8)
1629 fprintf (stream, "\n ");
1630 fprintf (stream, " },\n");
1631 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1632 not 32-bit units, in order to make the lookup function easier. */
/* The zero-filled buffer holds (level3_size << p) * 5 / 16 + 1 units.
   NOTE(review): no check of the calloc result is visible in this excerpt.  */
1635 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits i*5 .. i*5+4 of the packed bit stream: j is the
   16-bit unit containing its first bit and k the bit position inside that
   unit.  Units j and j+1 are combined into one 32-bit window, the entry is
   ORed in at bit k, and both units are written back, so an entry may
   straddle a unit boundary.
   NOTE(review): level3_packed[j+1] is promoted to int before the shift by
   16; converting it to uint32_t first would avoid a signed overflow should
   that unit ever hold a value of 0x8000 or more.  */
1636 for (i = 0; i < t.level3_size << t.p; i++)
1638 unsigned int j = (i * 5) / 16;
1639 unsigned int k = (i * 5) % 16;
1640 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1641 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1642 level3_packed[j] = value & 0xffff;
1643 level3_packed[j+1] = value >> 16;
/* Emit the packed units as 4-digit hexadecimal constants, 8 per line.  */
1645 fprintf (stream, " {");
1646 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1647 fprintf (stream, "\n ");
1648 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1650 if (i > 0 && (i % 8) == 0)
1651 fprintf (stream, "\n ");
1652 fprintf (stream, " 0x%04x", level3_packed[i]);
1653 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1654 fprintf (stream, ",");
1656 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1657 fprintf (stream, "\n ");
1658 fprintf (stream, " }\n");
1659 free (level3_packed);
1660 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1662 if (ferror (stream) || fclose (stream))
1664 fprintf (stderr, "error writing to '%s'\n", filename);
1669 /* ========================================================================= */
1671 /* Decimal digit value. */
1672 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of CH, parsed with atoi from its 'decdigit'
   attribute, when CH is assigned (non-NULL name) and that attribute string is
   not empty.  The result for any other CH is not visible in this excerpt;
   callers assert a value of at least -1, so it is presumably -1.  */
1675 get_decdigit_value (unsigned int ch)
1677 if (unicode_attributes[ch].name != NULL
1678 && unicode_attributes[ch].decdigit[0] != '\0')
1679 return atoi (unicode_attributes[ch].decdigit);
1683 /* Construction of sparse 3-level tables. */
/* TABLE supplies the name behind struct decdigit_table and the
   decdigit_table_* functions, which both output_decimal_digit and
   output_digit use.  Elements are uint8_t, and xmalloc/xrealloc are mapped to
   the plain malloc/realloc.  Presumably these macros parameterize a table
   template included right after them; that include is not visible here.  */
1684 #define TABLE decdigit_table
1685 #define ELEMENT uint8_t
1687 #define xmalloc malloc
1688 #define xrealloc realloc
1691 /* Output the unit test for the per-character decimal digit value table. */
/* Writes to FILENAME a banner comment and a list of "{ 0xXXXX, value }"
   entries separated by ",\n", with values taken from get_decdigit_value for
   code points 0 .. 0x10FFFF.  Every value is asserted to lie in -1 .. 9.
   Which code points actually get an entry is decided by a test that is not
   visible in this excerpt.  */
1693 output_decimal_digit_test (const char *filename, const char *version)
1699 stream = fopen (filename, "w");
1702 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1706 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1707 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1708 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1712 for (ch = 0; ch < 0x110000; ch++)
1714 int value = get_decdigit_value (ch);
1716 assert (value >= -1 && value < 10);
1721 fprintf (stream, ",\n");
1722 fprintf (stream, " { 0x%04X, %d }", ch, value);
1727 fprintf (stream, "\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1729 if (ferror (stream) || fclose (stream))
1731 fprintf (stderr, "error writing to '%s'\n", filename);
1736 /* Output the per-character decimal digit value table. */
/* Writes to FILENAME five decdigit_header_N macros followed by the
   initialized struct u_decdigit.  The table stores 1 + the decimal digit
   value of each code point, and the level3 data is emitted with two entries
   per byte.  VERSION is presumably the string substituted for the %s in the
   banner comment; the argument line is not visible here.  */
1738 output_decimal_digit (const char *filename, const char *version)
1742 struct decdigit_table t;
1743 unsigned int level1_offset, level2_offset, level3_offset;
1745 stream = fopen (filename, "w");
1748 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1752 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1753 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1754 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1759 decdigit_table_init (&t);
/* The stored value is shifted up by one and asserted to lie in 0 .. 10, so
   it fits into the 4 bits used by the packing further down.  */
1761 for (ch = 0; ch < 0x110000; ch++)
1763 int value = 1 + get_decdigit_value (ch);
1765 assert (value >= 0 && value <= 10);
1767 decdigit_table_add (&t, ch, value);
1770 decdigit_table_finalize (&t);
1772 /* Offsets in t.result, in memory of this process. */
/* Byte offsets past the 5 header words, past level1 and past level2.
   Presumably they are assigned to level1_offset, level2_offset and
   level3_offset in that order; the assignment lines are not visible.  */
1774 5 * sizeof (uint32_t);
1776 5 * sizeof (uint32_t)
1777 + t.level1_size * sizeof (uint32_t);
1779 5 * sizeof (uint32_t)
1780 + t.level1_size * sizeof (uint32_t)
1781 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the header words are read as uint32_t but printed with %d.  */
1783 for (i = 0; i < 5; i++)
1784 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1785 ((uint32_t *) t.result)[i]);
1786 fprintf (stream, "static const\n");
1787 fprintf (stream, "struct\n");
1788 fprintf (stream, " {\n");
1789 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1790 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1791 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1793 fprintf (stream, " }\n");
1794 fprintf (stream, "u_decdigit =\n");
1795 fprintf (stream, "{\n");
/* level1 initializer, 8 values per line: each entry is written either as -1
   or as its position in level2 in uint32_t units.  The selecting condition
   is not visible here.  */
1796 fprintf (stream, " {");
1797 if (t.level1_size > 8)
1798 fprintf (stream, "\n ");
1799 for (i = 0; i < t.level1_size; i++)
1802 if (i > 0 && (i % 8) == 0)
1803 fprintf (stream, "\n ");
1804 offset = ((uint32_t *) (t.result + level1_offset))[i];
1806 fprintf (stream, " %5d", -1);
1808 fprintf (stream, " %5zu",
1809 (offset - level2_offset) / sizeof (uint32_t));
1810 if (i+1 < t.level1_size)
1811 fprintf (stream, ",");
1813 if (t.level1_size > 8)
1814 fprintf (stream, "\n ");
1815 fprintf (stream, " },\n");
/* level2 initializer: -1 or the position in the unpacked level3 data, in
   uint8_t units.  */
1816 fprintf (stream, " {");
1817 if (t.level2_size << t.q > 8)
1818 fprintf (stream, "\n ");
1819 for (i = 0; i < t.level2_size << t.q; i++)
1822 if (i > 0 && (i % 8) == 0)
1823 fprintf (stream, "\n ");
1824 offset = ((uint32_t *) (t.result + level2_offset))[i];
1826 fprintf (stream, " %5d", -1);
1828 fprintf (stream, " %5zu",
1829 (offset - level3_offset) / sizeof (uint8_t));
1830 if (i+1 < t.level2_size << t.q)
1831 fprintf (stream, ",");
1833 if (t.level2_size << t.q > 8)
1834 fprintf (stream, "\n ");
1835 fprintf (stream, " },\n");
1836 /* Pack the level3 array. Each entry needs 4 bits only. */
/* level3_size << (p - 1) bytes are written.  Output byte i carries entry 2*i
   in its low nibble and entry 2*i+1 in its high nibble.  */
1837 fprintf (stream, " {");
1838 if (t.level3_size << (t.p - 1) > 8)
1839 fprintf (stream, "\n ");
1840 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1842 if (i > 0 && (i % 8) == 0)
1843 fprintf (stream, "\n ");
1844 fprintf (stream, " 0x%02x",
1845 ((uint8_t *) (t.result + level3_offset))[2*i]
1846 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1847 if (i+1 < t.level3_size << (t.p - 1))
1848 fprintf (stream, ",");
1850 if (t.level3_size << (t.p - 1) > 8)
1851 fprintf (stream, "\n ");
1852 fprintf (stream, " }\n");
1853 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1855 if (ferror (stream) || fclose (stream))
1857 fprintf (stderr, "error writing to '%s'\n", filename);
1862 /* ========================================================================= */
1865 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of CH, parsed with atoi from its 'digit' attribute,
   when CH is assigned (non-NULL name) and that attribute string is not empty.
   The result for any other CH is not visible in this excerpt; callers assert
   a value of at least -1, so it is presumably -1.  */
1868 get_digit_value (unsigned int ch)
1870 if (unicode_attributes[ch].name != NULL
1871 && unicode_attributes[ch].digit[0] != '\0')
1872 return atoi (unicode_attributes[ch].digit);
1876 /* Output the unit test for the per-character digit value table. */
/* Writes to FILENAME a banner comment and a list of "{ 0xXXXX, value }"
   entries separated by ",\n", with values taken from get_digit_value for code
   points 0 .. 0x10FFFF.  Every value is asserted to lie in -1 .. 9.  Which
   code points actually get an entry is decided by a test that is not visible
   in this excerpt.  */
1878 output_digit_test (const char *filename, const char *version)
1884 stream = fopen (filename, "w");
1887 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1891 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1892 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1893 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1897 for (ch = 0; ch < 0x110000; ch++)
1899 int value = get_digit_value (ch);
1901 assert (value >= -1 && value < 10);
1906 fprintf (stream, ",\n");
1907 fprintf (stream, " { 0x%04X, %d }", ch, value);
1912 fprintf (stream, "\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1914 if (ferror (stream) || fclose (stream))
1916 fprintf (stderr, "error writing to '%s'\n", filename);
1921 /* Output the per-character digit value table. */
/* Writes to FILENAME five digit_header_N macros followed by the initialized
   struct u_digit.  It reuses the decdigit_table type, stores 1 + the digit
   value of each code point, and emits the level3 data with two entries per
   byte.  VERSION is presumably the string substituted for the %s in the
   banner comment; the argument line is not visible here.  */
1923 output_digit (const char *filename, const char *version)
1927 struct decdigit_table t;
1928 unsigned int level1_offset, level2_offset, level3_offset;
1930 stream = fopen (filename, "w");
1933 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1937 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1938 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1939 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1944 decdigit_table_init (&t);
/* The stored value is shifted up by one and asserted to lie in 0 .. 10, so
   it fits into the 4 bits used by the packing further down.  */
1946 for (ch = 0; ch < 0x110000; ch++)
1948 int value = 1 + get_digit_value (ch);
1950 assert (value >= 0 && value <= 10);
1952 decdigit_table_add (&t, ch, value);
1955 decdigit_table_finalize (&t);
1957 /* Offsets in t.result, in memory of this process. */
/* Byte offsets past the 5 header words, past level1 and past level2.
   Presumably they are assigned to level1_offset, level2_offset and
   level3_offset in that order; the assignment lines are not visible.  */
1959 5 * sizeof (uint32_t);
1961 5 * sizeof (uint32_t)
1962 + t.level1_size * sizeof (uint32_t);
1964 5 * sizeof (uint32_t)
1965 + t.level1_size * sizeof (uint32_t)
1966 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the header words are read as uint32_t but printed with %d.  */
1968 for (i = 0; i < 5; i++)
1969 fprintf (stream, "#define digit_header_%d %d\n", i,
1970 ((uint32_t *) t.result)[i]);
1971 fprintf (stream, "static const\n");
1972 fprintf (stream, "struct\n");
1973 fprintf (stream, " {\n");
1974 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1975 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1976 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1978 fprintf (stream, " }\n");
1979 fprintf (stream, "u_digit =\n");
1980 fprintf (stream, "{\n");
/* level1 initializer, 8 values per line: each entry is written either as -1
   or as its position in level2 in uint32_t units.  The selecting condition
   is not visible here.  */
1981 fprintf (stream, " {");
1982 if (t.level1_size > 8)
1983 fprintf (stream, "\n ");
1984 for (i = 0; i < t.level1_size; i++)
1987 if (i > 0 && (i % 8) == 0)
1988 fprintf (stream, "\n ");
1989 offset = ((uint32_t *) (t.result + level1_offset))[i];
1991 fprintf (stream, " %5d", -1);
1993 fprintf (stream, " %5zu",
1994 (offset - level2_offset) / sizeof (uint32_t));
1995 if (i+1 < t.level1_size)
1996 fprintf (stream, ",");
1998 if (t.level1_size > 8)
1999 fprintf (stream, "\n ");
2000 fprintf (stream, " },\n");
/* level2 initializer: -1 or the position in the unpacked level3 data, in
   uint8_t units.  */
2001 fprintf (stream, " {");
2002 if (t.level2_size << t.q > 8)
2003 fprintf (stream, "\n ");
2004 for (i = 0; i < t.level2_size << t.q; i++)
2007 if (i > 0 && (i % 8) == 0)
2008 fprintf (stream, "\n ");
2009 offset = ((uint32_t *) (t.result + level2_offset))[i];
2011 fprintf (stream, " %5d", -1);
2013 fprintf (stream, " %5zu",
2014 (offset - level3_offset) / sizeof (uint8_t));
2015 if (i+1 < t.level2_size << t.q)
2016 fprintf (stream, ",");
2018 if (t.level2_size << t.q > 8)
2019 fprintf (stream, "\n ");
2020 fprintf (stream, " },\n");
2021 /* Pack the level3 array. Each entry needs 4 bits only. */
/* level3_size << (p - 1) bytes are written.  Output byte i carries entry 2*i
   in its low nibble and entry 2*i+1 in its high nibble.  */
2022 fprintf (stream, " {");
2023 if (t.level3_size << (t.p - 1) > 8)
2024 fprintf (stream, "\n ");
2025 for (i = 0; i < t.level3_size << (t.p - 1); i++)
2027 if (i > 0 && (i % 8) == 0)
2028 fprintf (stream, "\n ");
2029 fprintf (stream, " 0x%02x",
2030 ((uint8_t *) (t.result + level3_offset))[2*i]
2031 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
2032 if (i+1 < t.level3_size << (t.p - 1))
2033 fprintf (stream, ",");
2035 if (t.level3_size << (t.p - 1) > 8)
2036 fprintf (stream, "\n ");
2037 fprintf (stream, " }\n");
2038 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
2040 if (ferror (stream) || fclose (stream))
2042 fprintf (stderr, "error writing to '%s'\n", filename);
2047 /* ========================================================================= */
2049 /* Numeric value. */
2050 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as numerator / denominator.  get_numeric_value
   produces the pair 0 / 0 for characters without a usable numeric attribute,
   and output_numeric_test skips exactly that pair.  */
2052 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Computes the numeric value of CH as a fraction.
   When CH is assigned (non-NULL name) and its 'numeric' attribute string is
   not empty, the numerator is the leading integer of that string.  The
   denominator is the integer after the '/' if there is one, and 1 otherwise.
   The pair 0 / 0 is produced on the remaining path, presumably the branch for
   characters without a numeric attribute; the branch keywords and the return
   statement are not visible in this excerpt.  */
2054 static uc_fraction_t
2055 get_numeric_value (unsigned int ch)
2057 uc_fraction_t value;
2059 if (unicode_attributes[ch].name != NULL
2060 && unicode_attributes[ch].numeric[0] != '\0')
2062 const char *str = unicode_attributes[ch].numeric;
2063 /* str is of the form "integer" or "integer/posinteger". */
/* atoi stops at the first non-digit, so the numerator parse ends at '/'.  */
2064 value.numerator = atoi (str);
2065 if (strchr (str, '/') != NULL)
2066 value.denominator = atoi (strchr (str, '/') + 1);
2068 value.denominator = 1;
2072 value.numerator = 0;
2073 value.denominator = 0;
2078 /* Output the unit test for the per-character numeric value table. */
/* Writes to FILENAME a banner comment and a list of
   "{ 0xXXXX, numerator, denominator }" entries separated by ",\n".  Only code
   points whose numeric value is not the pair 0 / 0 are considered.  */
2080 output_numeric_test (const char *filename, const char *version)
2086 stream = fopen (filename, "w");
2089 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2093 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2094 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2095 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2099 for (ch = 0; ch < 0x110000; ch++)
2101 uc_fraction_t value = get_numeric_value (ch);
2103 if (value.numerator != 0 || value.denominator != 0)
2106 fprintf (stream, ",\n");
2107 fprintf (stream, " { 0x%04X, %d, %d }",
2108 ch, value.numerator, value.denominator);
2113 fprintf (stream, "\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
2115 if (ferror (stream) || fclose (stream))
2117 fprintf (stderr, "error writing to '%s'\n", filename);
2122 /* Construction of sparse 3-level tables. */
/* TABLE supplies the name behind struct numeric_table and the
   numeric_table_* functions used by output_numeric.  Elements are uint8_t,
   and xmalloc/xrealloc are mapped to the plain malloc/realloc.  Presumably
   these macros parameterize a table template included right after them; that
   include is not visible in this excerpt.  */
2123 #define TABLE numeric_table
2124 #define ELEMENT uint8_t
2126 #define xmalloc malloc
2127 #define xrealloc realloc
2130 /* Output the per-character numeric value table. */
/* Writes to FILENAME the array u_numeric_values of all distinct fractions
   that occur, then five numeric_header_N macros and the initialized struct
   u_numeric.  The 3-level table maps each code point to the index of its
   fraction in u_numeric_values.  VERSION is presumably the string substituted
   for the %s in the banner comment; the argument line is not visible here.  */
2132 output_numeric (const char *filename, const char *version)
/* Fixed capacity for the distinct fractions; overflow is caught by the
   assertion in the collection loop.  */
2135 uc_fraction_t fractions[160];
2136 unsigned int nfractions;
2137 unsigned int ch, i, j;
2138 struct numeric_table t;
2139 unsigned int level1_offset, level2_offset, level3_offset;
2140 uint16_t *level3_packed;
2142 stream = fopen (filename, "w");
2145 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2149 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2150 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2151 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2154 /* Create table of occurring fractions. */
/* For every code point: search the fractions collected so far.  If the value
   is new, find the first slot whose fraction is larger when ordered by
   denominator and then numerator, shift the tail up by one and insert there,
   which keeps the array sorted.  No filtering is visible, so the pair 0 / 0
   of characters without a numeric value presumably becomes an entry too.  */
2156 for (ch = 0; ch < 0x110000; ch++)
2158 uc_fraction_t value = get_numeric_value (ch);
2160 for (i = 0; i < nfractions; i++)
2161 if (value.numerator == fractions[i].numerator
2162 && value.denominator == fractions[i].denominator)
2164 if (i == nfractions)
2166 assert (nfractions != SIZEOF (fractions));
2167 for (i = 0; i < nfractions; i++)
2168 if (value.denominator < fractions[i].denominator
2169 || (value.denominator == fractions[i].denominator
2170 && value.numerator < fractions[i].numerator))
2172 for (j = nfractions; j > i; j--)
2173 fractions[j] = fractions[j - 1];
2174 fractions[i] = value;
/* NOTE(review): the array length is printed with %d; its argument is on a
   line not visible here (presumably the unsigned nfractions).  */
2179 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2181 fprintf (stream, "{\n");
2182 for (i = 0; i < nfractions; i++)
2184 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2185 fractions[i].denominator);
2186 if (i+1 < nfractions)
2187 fprintf (stream, ",");
2188 fprintf (stream, "\n");
2190 fprintf (stream, "};\n");
2194 numeric_table_init (&t);
/* Second pass: look up each code point's fraction again and store its index.
   The assertion checks that the fraction was found.  */
2196 for (ch = 0; ch < 0x110000; ch++)
2198 uc_fraction_t value = get_numeric_value (ch);
2200 for (i = 0; i < nfractions; i++)
2201 if (value.numerator == fractions[i].numerator
2202 && value.denominator == fractions[i].denominator)
2204 assert (i != nfractions);
2206 numeric_table_add (&t, ch, i);
2209 numeric_table_finalize (&t);
2211 /* Offsets in t.result, in memory of this process. */
/* Byte offsets past the 5 header words, past level1 and past level2.
   Presumably they are assigned to level1_offset, level2_offset and
   level3_offset in that order; the assignment lines are not visible.  */
2213 5 * sizeof (uint32_t);
2215 5 * sizeof (uint32_t)
2216 + t.level1_size * sizeof (uint32_t);
2218 5 * sizeof (uint32_t)
2219 + t.level1_size * sizeof (uint32_t)
2220 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the header words are read as uint32_t but printed with %d.  */
2222 for (i = 0; i < 5; i++)
2223 fprintf (stream, "#define numeric_header_%d %d\n", i,
2224 ((uint32_t *) t.result)[i]);
2225 fprintf (stream, "static const\n");
2226 fprintf (stream, "struct\n");
2227 fprintf (stream, " {\n");
2228 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2229 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2230 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2231 (1 << t.p) * 8 / 16);
2232 fprintf (stream, " }\n");
2233 fprintf (stream, "u_numeric =\n");
2234 fprintf (stream, "{\n");
/* level1 initializer, 8 values per line: each entry is written either as -1
   or as its position in level2 in uint32_t units.  The selecting condition
   is not visible here.  */
2235 fprintf (stream, " {");
2236 if (t.level1_size > 8)
2237 fprintf (stream, "\n ");
2238 for (i = 0; i < t.level1_size; i++)
2241 if (i > 0 && (i % 8) == 0)
2242 fprintf (stream, "\n ");
2243 offset = ((uint32_t *) (t.result + level1_offset))[i];
2245 fprintf (stream, " %5d", -1);
2247 fprintf (stream, " %5zu",
2248 (offset - level2_offset) / sizeof (uint32_t));
2249 if (i+1 < t.level1_size)
2250 fprintf (stream, ",");
2252 if (t.level1_size > 8)
2253 fprintf (stream, "\n ");
2254 fprintf (stream, " },\n");
/* level2 initializer: -1 or the position in the unpacked level3 data, in
   uint8_t units.  */
2255 fprintf (stream, " {");
2256 if (t.level2_size << t.q > 8)
2257 fprintf (stream, "\n ");
2258 for (i = 0; i < t.level2_size << t.q; i++)
2261 if (i > 0 && (i % 8) == 0)
2262 fprintf (stream, "\n ");
2263 offset = ((uint32_t *) (t.result + level2_offset))[i];
2265 fprintf (stream, " %5d", -1);
2267 fprintf (stream, " %5zu",
2268 (offset - level3_offset) / sizeof (uint8_t));
2269 if (i+1 < t.level2_size << t.q)
2270 fprintf (stream, ",");
2272 if (t.level2_size << t.q > 8)
2273 fprintf (stream, "\n ");
2274 fprintf (stream, " },\n");
2275 /* Pack the level3 array. Each entry needs 8 bits only. Use 16-bit units,
2276 not 32-bit units, in order to make the lookup function easier. */
/* The zero-filled buffer holds (level3_size << p) * 8 / 16 + 1 units.
   NOTE(review): no check of the calloc result is visible in this excerpt.  */
2279 calloc ((t.level3_size << t.p) * 8 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits i*8 .. i*8+7 of the packed bit stream: j is the
   16-bit unit and k (0 or 8) the bit position inside it.  Units j and j+1 are
   combined into one 32-bit window, the entry is ORed in at bit k, and both
   units are written back.
   NOTE(review): level3_packed[j+1] is promoted to int before the shift by
   16; converting it to uint32_t first would avoid a signed overflow should
   that unit ever hold a value of 0x8000 or more.  */
2280 for (i = 0; i < t.level3_size << t.p; i++)
2282 unsigned int j = (i * 8) / 16;
2283 unsigned int k = (i * 8) % 16;
2284 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2285 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2286 level3_packed[j] = value & 0xffff;
2287 level3_packed[j+1] = value >> 16;
/* Emit the packed units as 4-digit hexadecimal constants, 8 per line.  */
2289 fprintf (stream, " {");
2290 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2291 fprintf (stream, "\n ");
2292 for (i = 0; i < (t.level3_size << t.p) * 8 / 16 + 1; i++)
2294 if (i > 0 && (i % 8) == 0)
2295 fprintf (stream, "\n ");
2296 fprintf (stream, " 0x%04x", level3_packed[i]);
2297 if (i+1 < (t.level3_size << t.p) * 8 / 16 + 1)
2298 fprintf (stream, ",");
2300 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2301 fprintf (stream, "\n ");
2302 fprintf (stream, " }\n");
2303 free (level3_packed);
2304 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
2306 if (ferror (stream) || fclose (stream))
2308 fprintf (stderr, "error writing to '%s'\n", filename);
2313 /* ========================================================================= */
2316 /* See Unicode 3.0 book, section 4.7,
2319 /* List of mirrored character pairs. This is a subset of the characters
2320 having the BidiMirrored property. */
/* Each row holds the two code points of one pair.  get_mirror_value compares
   a character against both columns, so a pair needs to be listed only once.
   The initializer itself is not visible in this excerpt.  */
2321 static unsigned int mirror_pairs[][2] =
/* Computes the mirror information of CH as a signed difference.
   CH counts as mirrored when it is assigned (non-NULL name) and its
   'mirrored' attribute is set.  Its partner is looked up in either column of
   mirror_pairs, with 0xfffd as the value when no pair contains CH.  One path
   returns partner - CH.  The other path asserts that no partner was found;
   its return value, and the condition choosing between the two paths
   (presumably the mirrored flag), are not visible in this excerpt.  */
2378 get_mirror_value (unsigned int ch)
2381 unsigned int mirror_char;
2384 mirrored = (unicode_attributes[ch].name != NULL
2385 && unicode_attributes[ch].mirrored);
2386 mirror_char = 0xfffd;
2387 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2388 if (ch == mirror_pairs[i][0])
2390 mirror_char = mirror_pairs[i][1];
2393 else if (ch == mirror_pairs[i][1])
2395 mirror_char = mirror_pairs[i][0];
2399 return (int) mirror_char - (int) ch;
2402 assert (mirror_char == 0xfffd);
2407 /* Construction of sparse 3-level tables. */
/* TABLE supplies the name behind struct mirror_table and the mirror_table_*
   functions used by output_mirror.  Elements are int32_t, and
   xmalloc/xrealloc are mapped to the plain malloc/realloc.  Presumably these
   macros parameterize a table template included right after them; that
   include is not visible in this excerpt.  */
2408 #define TABLE mirror_table
2409 #define ELEMENT int32_t
2411 #define xmalloc malloc
2412 #define xrealloc realloc
2415 /* Output the per-character mirror table. */
/* Writes to FILENAME five mirror_header_N macros followed by the initialized
   struct u_mirror.  The table stores get_mirror_value for every code point,
   and level3 is emitted unpacked as int values.  VERSION is presumably the
   string substituted for the %s in the banner comment; the argument line is
   not visible here.  */
2417 output_mirror (const char *filename, const char *version)
2421 struct mirror_table t;
2422 unsigned int level1_offset, level2_offset, level3_offset;
2424 stream = fopen (filename, "w");
2427 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2431 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2432 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2433 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2438 mirror_table_init (&t);
2440 for (ch = 0; ch < 0x110000; ch++)
2442 int value = get_mirror_value (ch);
2444 mirror_table_add (&t, ch, value);
2447 mirror_table_finalize (&t);
2449 /* Offsets in t.result, in memory of this process. */
/* Byte offsets past the 5 header words, past level1 and past level2.
   Presumably they are assigned to level1_offset, level2_offset and
   level3_offset in that order; the assignment lines are not visible.  */
2451 5 * sizeof (uint32_t);
2453 5 * sizeof (uint32_t)
2454 + t.level1_size * sizeof (uint32_t);
2456 5 * sizeof (uint32_t)
2457 + t.level1_size * sizeof (uint32_t)
2458 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the header words are read as uint32_t but printed with %d.  */
2460 for (i = 0; i < 5; i++)
2461 fprintf (stream, "#define mirror_header_%d %d\n", i,
2462 ((uint32_t *) t.result)[i]);
2463 fprintf (stream, "static const\n");
2464 fprintf (stream, "struct\n");
2465 fprintf (stream, " {\n");
2466 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2467 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2468 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2469 fprintf (stream, " }\n");
2470 fprintf (stream, "u_mirror =\n");
2471 fprintf (stream, "{\n");
/* level1 initializer, 8 values per line: each entry is written either as -1
   or as its position in level2 in uint32_t units.  The selecting condition
   is not visible here.  */
2472 fprintf (stream, " {");
2473 if (t.level1_size > 8)
2474 fprintf (stream, "\n ");
2475 for (i = 0; i < t.level1_size; i++)
2478 if (i > 0 && (i % 8) == 0)
2479 fprintf (stream, "\n ");
2480 offset = ((uint32_t *) (t.result + level1_offset))[i];
2482 fprintf (stream, " %5d", -1);
2484 fprintf (stream, " %5zu",
2485 (offset - level2_offset) / sizeof (uint32_t));
2486 if (i+1 < t.level1_size)
2487 fprintf (stream, ",");
2489 if (t.level1_size > 8)
2490 fprintf (stream, "\n ");
2491 fprintf (stream, " },\n");
/* level2 initializer: -1 or the position in level3, in int32_t units.  */
2492 fprintf (stream, " {");
2493 if (t.level2_size << t.q > 8)
2494 fprintf (stream, "\n ");
2495 for (i = 0; i < t.level2_size << t.q; i++)
2498 if (i > 0 && (i % 8) == 0)
2499 fprintf (stream, "\n ");
2500 offset = ((uint32_t *) (t.result + level2_offset))[i];
2502 fprintf (stream, " %5d", -1);
2504 fprintf (stream, " %5zu",
2505 (offset - level3_offset) / sizeof (int32_t));
2506 if (i+1 < t.level2_size << t.q)
2507 fprintf (stream, ",");
2509 if (t.level2_size << t.q > 8)
2510 fprintf (stream, "\n ");
2511 fprintf (stream, " },\n");
/* level3 initializer: the signed 32-bit values themselves.  */
2512 fprintf (stream, " {");
2513 if (t.level3_size << t.p > 8)
2514 fprintf (stream, "\n ");
2515 for (i = 0; i < t.level3_size << t.p; i++)
2517 if (i > 0 && (i % 8) == 0)
2518 fprintf (stream, "\n ");
2519 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2520 if (i+1 < t.level3_size << t.p)
2521 fprintf (stream, ",");
2523 if (t.level3_size << t.p > 8)
2524 fprintf (stream, "\n ");
2525 fprintf (stream, " }\n");
2526 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
2528 if (ferror (stream) || fclose (stream))
2530 fprintf (stderr, "error writing to '%s'\n", filename);
2535 /* ========================================================================= */
2537 /* Particular values of the word break property. */
/* Returns true if CH is one of the seven code points that this generator
   treats as having the word break property value MidNumLet, and false for
   every other value of CH.  */
static bool
is_WBP_MIDNUMLET (unsigned int ch)
{
  return (ch == 0x002E    /* FULL STOP */
          || ch == 0x2018 /* LEFT SINGLE QUOTATION MARK */
          || ch == 0x2019 /* RIGHT SINGLE QUOTATION MARK */
          || ch == 0x2024 /* ONE DOT LEADER */
          || ch == 0xFE52 /* SMALL FULL STOP */
          || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
          || ch == 0xFF0E /* FULLWIDTH FULL STOP */);
}
/* Tests whether CH equals one of the code points compared against below
   (going by the function name, presumably those with the word break property
   value MidLetter).
   NOTE(review): the return expression continues past the last line shown
   here, so the list visible in this excerpt may be incomplete.  */
2547 is_WBP_MIDLETTER (unsigned int ch)
2549 return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
2550 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A
2554 /* ========================================================================= */
2558 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers.  fill_properties matches the property name read from
   a file against a string and selects the corresponding PROP_* value.  Only
   part of the enumerator list is visible in this excerpt (its opening and
   several entries are missing), so the numeric values cannot be derived from
   the lines shown here.  */
2567 PROP_QUOTATION_MARK,
2568 PROP_TERMINAL_PUNCTUATION,
2571 PROP_ASCII_HEX_DIGIT,
2572 PROP_OTHER_ALPHABETIC,
2576 PROP_OTHER_LOWERCASE,
2577 PROP_OTHER_UPPERCASE,
2578 PROP_NONCHARACTER_CODE_POINT,
2579 PROP_OTHER_GRAPHEME_EXTEND,
2580 PROP_IDS_BINARY_OPERATOR,
2581 PROP_IDS_TRINARY_OPERATOR,
2583 PROP_UNIFIED_IDEOGRAPH,
2584 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2587 PROP_LOGICAL_ORDER_EXCEPTION,
2588 PROP_OTHER_ID_START,
2589 PROP_OTHER_ID_CONTINUE,
2591 PROP_VARIATION_SELECTOR,
2592 PROP_PATTERN_WHITE_SPACE,
2593 PROP_PATTERN_SYNTAX,
2594 /* DerivedCoreProperties.txt */
2600 PROP_CASE_IGNORABLE,
2601 PROP_CHANGES_WHEN_LOWERCASED,
2602 PROP_CHANGES_WHEN_UPPERCASED,
2603 PROP_CHANGES_WHEN_TITLECASED,
2604 PROP_CHANGES_WHEN_CASEFOLDED,
2605 PROP_CHANGES_WHEN_CASEMAPPED,
2610 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2611 PROP_GRAPHEME_EXTEND,
/* Property word of every code point, indexed by code point 0 .. 0x10FFFF.
   NOTE(review): presumably one bit per PROP_* value; the code that sets the
   bits is outside this block.  */
unsigned long long unicode_properties[0x110000];

/* Resets the property word of every code point to zero.  */
static void
clear_properties (void)
{
  unsigned int i;

  for (i = 0; i < 0x110000; i++)
    unicode_properties[i] = 0;
}
2626 /* Stores in unicode_properties[] the properties from the
2627 PropList.txt or DerivedCoreProperties.txt file. */
/* Reads the property file PROPLIST_FILENAME and records each listed
   property in unicode_properties[].
   Every data line names either a code point range (XXXX..YYYY) or a single
   code point (XXXX), followed by blanks/semicolons and a property name.
   The name is translated to a PROP_* bit number, and that bit is set for
   every code point of the range.  Problems are reported on stderr.  */
2629 fill_properties (const char *proplist_filename)
2634 stream = fopen (proplist_filename, "r");
2637 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2644 unsigned int i1, i2;
2645 char padding[200+1];
2646 char propname[200+1];
2647 unsigned int propvalue;
/* Fetch the next line, at most 200 characters of it.  */
2649 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and '#' comment lines carry no data.  */
2652 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.
   NOTE(review): the two %[...] conversions have no field width; they stay
   within padding[] and propname[] only as long as the line itself is
   limited to 200 characters, as the fscanf above does.  */
2655 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2657 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2659 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* Each PROP line is one link of an if/else-if chain that compares propname
   with a property name and, on a match, stores the bit number.  */
2664 #define PROP(name,value) \
2665 if (strcmp (propname, name) == 0) propvalue = value; else
2667 PROP ("White_Space", PROP_WHITE_SPACE)
2668 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2669 PROP ("Join_Control", PROP_JOIN_CONTROL)
2670 PROP ("Dash", PROP_DASH)
2671 PROP ("Hyphen", PROP_HYPHEN)
2672 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2673 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2674 PROP ("Other_Math", PROP_OTHER_MATH)
2675 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2676 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2677 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2678 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2679 PROP ("Diacritic", PROP_DIACRITIC)
2680 PROP ("Extender", PROP_EXTENDER)
2681 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2682 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2683 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2684 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2685 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2686 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2687 PROP ("Radical", PROP_RADICAL)
2688 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2689 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2690 PROP ("Deprecated", PROP_DEPRECATED)
2691 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2692 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2693 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2694 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2695 PROP ("STerm", PROP_STERM)
2696 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2697 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2698 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2699 /* DerivedCoreProperties.txt */
2700 PROP ("Math", PROP_MATH)
2701 PROP ("Alphabetic", PROP_ALPHABETIC)
2702 PROP ("Lowercase", PROP_LOWERCASE)
2703 PROP ("Uppercase", PROP_UPPERCASE)
2704 PROP ("Cased", PROP_CASED)
2705 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
2706 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
2707 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
2708 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
2709 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
2710 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
2711 PROP ("ID_Start", PROP_ID_START)
2712 PROP ("ID_Continue", PROP_ID_CONTINUE)
2713 PROP ("XID_Start", PROP_XID_START)
2714 PROP ("XID_Continue", PROP_XID_CONTINUE)
2715 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2716 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2717 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2718 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
/* End of the chain: no known property name matched.  */
2721 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and must lie inside the Unicode code space.  */
2725 assert (i1 <= i2 && i2 < 0x110000);
/* Set the property's bit for every code point of the range.  */
2727 for (i = i1; i <= i2; i++)
2728 unicode_properties[i] |= 1ULL << propvalue;
/* A read error and a failing fclose are reported alike.  */
2731 if (ferror (stream) || fclose (stream))
2733 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2738 /* Stores in array the given property from the Unicode 3.0 PropList.txt file. */
/* ARRAY has one entry per code point (0x110000 entries),
   PROPLIST_FILENAME names the Unicode 3.0 style property list, and
   PROPERTY_NAME is a text fragment that identifies the wanted section.
   The file is scanned up to the first line containing PROPERTY_NAME; the
   lines after it are parsed as code point ranges (XXXX..YYYY) or single
   code points (XXXX) written with four hexadecimal digits.
   Problems are reported on stderr.  */
2741 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
/* Start with a pass over all entries of ARRAY (presumably clearing them --
   TODO confirm).  */
2747 for (i = 0; i < 0x110000; i++)
2750 stream = fopen (proplist_filename, "r");
2753 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2757 /* Search for the "Property dump for: ..." line. */
2760 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2762 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
/* Keep reading until a line mentions the requested property.  */
2766 while (strstr (buf, property_name) == NULL);
2770 unsigned int i1, i2;
2772 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* A line of at least 10 characters with ".." at offset 4 is a range.  */
2776 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2778 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2780 fprintf (stderr, "parse error in property in '%s'\n",
/* Otherwise a line of at least 4 characters is a single code point.  */
2785 else if (strlen (buf) >= 4)
2787 if (sscanf (buf, "%4X", &i1) < 1)
2789 fprintf (stderr, "parse error in property in '%s'\n",
2797 fprintf (stderr, "parse error in property in '%s'\n",
/* The range must be ordered and must lie inside the Unicode code space.  */
2801 assert (i1 <= i2 && i2 < 0x110000);
/* Visit every code point of the range (presumably marking it in ARRAY --
   TODO confirm).  */
2802 for (i = i1; i <= i2; i++)
/* A read error and a failing fclose are reported alike.  */
2806 if (ferror (stream) || fclose (stream))
2808 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2813 /* Properties from Unicode 3.0 PropList.txt file. */
/* Both tables are indexed by code point; a nonzero entry means that the
   code point has the property.  */
2815 /* The paired punctuation property from the Unicode 3.0 PropList.txt file. */
2816 char unicode_pairedpunctuation[0x110000];
2818 /* The left of pair property from the Unicode 3.0 PropList.txt file. */
2819 char unicode_leftofpair[0x110000];
2822 fill_properties30 (const char *proplist30_filename)
2824 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2825 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2828 /* ------------------------------------------------------------------------- */
2830 /* See PropList.txt, UCD.html. */
2832 is_property_white_space (unsigned int ch)
2834 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2837 /* See Unicode 3.0 book, section 4.10,
2838 PropList.txt, UCD.html,
2839 DerivedCoreProperties.txt, UCD.html. */
/* Alphabetic test for CH, computed in two ways that must agree (see the
   assert at the end): one value is assembled from, among other terms, the
   Other_Alphabetic bit and a hard-coded list of code point ranges; the
   other is the Alphabetic bit, which comes from DerivedCoreProperties.txt.  */
2841 is_property_alphabetic (unsigned int ch)
2845 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2846 /* For some reason, the following are listed as having property
2847 Alphabetic but not as having property Other_Alphabetic. */
2848 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2849 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2850 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2851 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2852 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2853 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2854 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2855 || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
2856 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2857 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2858 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2859 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2860 || (ch >= 0x12400 && ch <= 0x1246E); /* CUNEIFORM NUMERIC SIGNS */
/* The value listed in the data file.  */
2862 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
/* A mismatch means that the hard-coded list above no longer fits the
   data files in use.  */
2864 assert (result1 == result2);
2868 /* See PropList.txt, UCD.html. */
2870 is_property_other_alphabetic (unsigned int ch)
2872 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2875 /* See PropList.txt, UCD.html. */
2877 is_property_not_a_character (unsigned int ch)
2879 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2882 /* See PropList.txt, UCD.html,
2883 DerivedCoreProperties.txt, UCD.html. */
/* Default_Ignorable_Code_Point test for CH, computed in two ways that must
   agree (see the assert at the end): one value is derived from the general
   category Cf minus some listed exceptions, plus the
   Other_Default_Ignorable_Code_Point and Variation_Selector bits; the other
   is the Default_Ignorable_Code_Point bit itself.  */
2885 is_property_default_ignorable_code_point (unsigned int ch)
2888 (is_category_Cf (ch)
2889 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2890 && !((ch >= 0x0600 && ch <= 0x0605) || ch == 0x06DD || ch == 0x070F)
2891 /* For some reason, the following are not listed as having property
2892 Default_Ignorable_Code_Point. */
2893 && !(ch == 0x110BD))
2894 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2895 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* The value listed in the data file.  */
2897 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
/* A mismatch means that the exception list above no longer fits the data
   files in use.  */
2899 assert (result1 == result2);
2903 /* See PropList.txt, UCD.html. */
2905 is_property_other_default_ignorable_code_point (unsigned int ch)
2907 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2910 /* See PropList.txt, UCD.html. */
2912 is_property_deprecated (unsigned int ch)
2914 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2917 /* See PropList.txt, UCD.html. */
2919 is_property_logical_order_exception (unsigned int ch)
2921 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2924 /* See PropList.txt, UCD.html. */
2926 is_property_variation_selector (unsigned int ch)
2928 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* True iff CH lies in one of the three private use ranges:
   U+E000..U+F8FF, U+F0000..U+FFFFD or U+100000..U+10FFFD.
   The ranges were determined through
   "grep 'Private Use,' UnicodeData-3.1.0.txt".
   See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
/* True iff CH has general category Cn but is not a noncharacter.
   See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
2948 /* See PropList.txt, UCD.html,
2949 DerivedCoreProperties.txt, UCD.html. */
/* Uppercase test for CH, computed in two ways that must agree (see the
   assert at the end): one value is derived and includes the Other_Uppercase
   bit, the other is the Uppercase bit itself.  */
2951 is_property_uppercase (unsigned int ch)
2955 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
/* The value listed in the data file.  */
2957 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2959 assert (result1 == result2);
2963 /* See PropList.txt, UCD.html. */
2965 is_property_other_uppercase (unsigned int ch)
2967 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2970 /* See PropList.txt, UCD.html,
2971 DerivedCoreProperties.txt, UCD.html. */
/* Lowercase test for CH, computed in two ways that must agree (see the
   assert at the end): one value is derived and includes the Other_Lowercase
   bit, the other is the Lowercase bit itself.  */
2973 is_property_lowercase (unsigned int ch)
2977 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* The value listed in the data file.  */
2979 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2981 assert (result1 == result2);
2985 /* See PropList.txt, UCD.html. */
2987 is_property_other_lowercase (unsigned int ch)
2989 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* True iff CH has general category Lt.  See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  if (is_category_Lt (ch))
    return true;
  return false;
}
2999 /* See DerivedCoreProperties.txt. */
/* Cased test for CH, computed in two ways that must agree (see the assert
   at the end).  */
3001 is_property_cased (unsigned int ch)
/* Derived value: lowercase, uppercase, or general category Lt.  */
3003 bool result1 = (is_property_lowercase (ch)
3004 || is_property_uppercase (ch)
3005 || is_category_Lt (ch));
/* Value listed in the data file: the Cased bit.  */
3006 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
3008 assert (result1 == result2);
3012 /* See DerivedCoreProperties.txt. */
/* Case_Ignorable test for CH, computed in two ways that must agree (see
   the assert at the end).  */
3014 is_property_case_ignorable (unsigned int ch)
/* Derived value: built from word break property tests (is_WBP_*) and the
   general categories Mn, Me, Cf, Lm and Sk.  */
3016 bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
3018 || is_category_Mn (ch)
3019 || is_category_Me (ch)
3020 || is_category_Cf (ch)
3021 || is_category_Lm (ch)
3022 || is_category_Sk (ch));
/* Value listed in the data file: the Case_Ignorable bit.  */
3023 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
3025 assert (result1 == result2);
3029 /* See DerivedCoreProperties.txt. */
/* Changes_When_Lowercased test for CH, computed in two ways that must
   agree (see the assert at the end).  */
3031 is_property_changes_when_lowercased (unsigned int ch)
/* Value listed in the data file: the Changes_When_Lowercased bit.  */
3033 bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
/* Derived value: CH has an entry in unicode_attributes[] and its lower
   mapping is neither NONE nor CH itself.  */
3034 bool result2 = (unicode_attributes[ch].name != NULL
3035 && unicode_attributes[ch].lower != NONE
3036 && unicode_attributes[ch].lower != ch);
3038 assert (result1 == result2);
3042 /* See DerivedCoreProperties.txt. */
3044 is_property_changes_when_uppercased (unsigned int ch)
3046 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
3049 /* See DerivedCoreProperties.txt. */
3051 is_property_changes_when_titlecased (unsigned int ch)
3053 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
3056 /* See DerivedCoreProperties.txt. */
3058 is_property_changes_when_casefolded (unsigned int ch)
3060 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
3063 /* See DerivedCoreProperties.txt. */
3065 is_property_changes_when_casemapped (unsigned int ch)
3067 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
3070 /* See PropList.txt, UCD.html. */
3072 is_property_soft_dotted (unsigned int ch)
3074 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
3077 /* See DerivedCoreProperties.txt, UCD.html. */
3079 is_property_id_start (unsigned int ch)
3081 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
3084 /* See PropList.txt, UCD.html. */
3086 is_property_other_id_start (unsigned int ch)
3088 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
3091 /* See DerivedCoreProperties.txt, UCD.html. */
3093 is_property_id_continue (unsigned int ch)
3095 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
3098 /* See PropList.txt, UCD.html. */
3100 is_property_other_id_continue (unsigned int ch)
3102 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
3105 /* See DerivedCoreProperties.txt, UCD.html. */
3107 is_property_xid_start (unsigned int ch)
3109 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
3112 /* See DerivedCoreProperties.txt, UCD.html. */
3114 is_property_xid_continue (unsigned int ch)
3116 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
3119 /* See PropList.txt, UCD.html. */
3121 is_property_pattern_white_space (unsigned int ch)
3123 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
3126 /* See PropList.txt, UCD.html. */
3128 is_property_pattern_syntax (unsigned int ch)
3130 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
3133 /* See PropList.txt, UCD.html. */
3135 is_property_join_control (unsigned int ch)
3137 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
3140 /* See DerivedCoreProperties.txt, UCD.html. */
3142 is_property_grapheme_base (unsigned int ch)
3144 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3147 /* See DerivedCoreProperties.txt, UCD.html. */
3149 is_property_grapheme_extend (unsigned int ch)
3151 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3154 /* See PropList.txt, UCD.html. */
3156 is_property_other_grapheme_extend (unsigned int ch)
3158 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3161 /* See DerivedCoreProperties.txt, UCD.html. */
3163 is_property_grapheme_link (unsigned int ch)
3165 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3168 /* See PropList.txt, UCD.html. */
3170 is_property_bidi_control (unsigned int ch)
3172 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3175 /* See PropList-3.0.1.txt. */
3177 is_property_bidi_left_to_right (unsigned int ch)
3179 return (get_bidi_category (ch) == UC_BIDI_L);
3182 /* See PropList-3.0.1.txt. */
3184 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3186 return (get_bidi_category (ch) == UC_BIDI_R);
3189 /* See PropList-3.0.1.txt. */
3191 is_property_bidi_arabic_right_to_left (unsigned int ch)
3193 return (get_bidi_category (ch) == UC_BIDI_AL);
3196 /* See PropList-3.0.1.txt. */
3198 is_property_bidi_european_digit (unsigned int ch)
3200 return (get_bidi_category (ch) == UC_BIDI_EN);
3203 /* See PropList-3.0.1.txt. */
3205 is_property_bidi_eur_num_separator (unsigned int ch)
3207 return (get_bidi_category (ch) == UC_BIDI_ES);
3210 /* See PropList-3.0.1.txt. */
3212 is_property_bidi_eur_num_terminator (unsigned int ch)
3214 return (get_bidi_category (ch) == UC_BIDI_ET);
3217 /* See PropList-3.0.1.txt. */
3219 is_property_bidi_arabic_digit (unsigned int ch)
3221 return (get_bidi_category (ch) == UC_BIDI_AN);
3224 /* See PropList-3.0.1.txt. */
3226 is_property_bidi_common_separator (unsigned int ch)
3228 return (get_bidi_category (ch) == UC_BIDI_CS);
3231 /* See PropList-3.0.1.txt. */
3233 is_property_bidi_block_separator (unsigned int ch)
3235 return (get_bidi_category (ch) == UC_BIDI_B);
3238 /* See PropList-3.0.1.txt. */
3240 is_property_bidi_segment_separator (unsigned int ch)
3242 return (get_bidi_category (ch) == UC_BIDI_S);
3245 /* See PropList-3.0.1.txt. */
3247 is_property_bidi_whitespace (unsigned int ch)
3249 return (get_bidi_category (ch) == UC_BIDI_WS);
3252 /* See PropList-3.0.1.txt. */
3254 is_property_bidi_non_spacing_mark (unsigned int ch)
3256 return (get_bidi_category (ch) == UC_BIDI_NSM);
3259 /* See PropList-3.0.1.txt. */
3261 is_property_bidi_boundary_neutral (unsigned int ch)
3263 return (get_bidi_category (ch) == UC_BIDI_BN);
3266 /* See PropList-3.0.1.txt. */
3268 is_property_bidi_pdf (unsigned int ch)
3270 return (get_bidi_category (ch) == UC_BIDI_PDF);
3273 /* See PropList-3.0.1.txt. */
3275 is_property_bidi_embedding_or_override (unsigned int ch)
3277 int category = get_bidi_category (ch);
3278 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3279 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3282 /* See PropList-3.0.1.txt. */
3284 is_property_bidi_other_neutral (unsigned int ch)
3286 return (get_bidi_category (ch) == UC_BIDI_ON);
3289 /* See PropList.txt, UCD.html. */
3291 is_property_hex_digit (unsigned int ch)
3293 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3296 /* See PropList.txt, UCD.html. */
3298 is_property_ascii_hex_digit (unsigned int ch)
3300 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3303 /* See Unicode 3.0 book, section 4.10,
3304 PropList.txt, UCD.html. */
3306 is_property_ideographic (unsigned int ch)
3308 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3311 /* See PropList.txt, UCD.html. */
3313 is_property_unified_ideograph (unsigned int ch)
3315 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3318 /* See PropList.txt, UCD.html. */
3320 is_property_radical (unsigned int ch)
3322 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3325 /* See PropList.txt, UCD.html. */
3327 is_property_ids_binary_operator (unsigned int ch)
3329 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3332 /* See PropList.txt, UCD.html. */
3334 is_property_ids_trinary_operator (unsigned int ch)
3336 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3339 /* See PropList-3.0.1.txt. */
3341 is_property_zero_width (unsigned int ch)
3343 return is_category_Cf (ch)
3344 || (unicode_attributes[ch].name != NULL
3345 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* True iff CH has general category Zs.  See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  if (is_category_Zs (ch))
    return true;
  return false;
}
/* True iff CH is one of a fixed list of non-breaking characters.
   See PropList-3.0.1.txt.
   NOTE(review): the list is meant to be exactly the set of characters
   having one particular line breaking property (presumably GL -- confirm
   against LineBreak.txt).  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3379 /* See PropList-3.0.1.txt. */
/* ISO control test for CH, computed in two ways that must agree (see the
   assert at the end): by the character name "<control>" and by the general
   category Cc.  */
3381 is_property_iso_control (unsigned int ch)
/* By name; code points without an entry in unicode_attributes[] fail.  */
3384 (unicode_attributes[ch].name != NULL
3385 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
/* By general category.  */
3387 is_category_Cc (ch);
3389 assert (result1 == result2);
3393 /* See PropList-3.0.1.txt. */
/* Format control test for CH: requires general category Cf, bidi category
   UC_BIDI_BN, and that CH is not a join control character.  */
3395 is_property_format_control (unsigned int ch)
3397 return (is_category_Cf (ch)
3398 && get_bidi_category (ch) == UC_BIDI_BN
3399 && !is_property_join_control (ch)
3403 /* See PropList.txt, UCD.html. */
3405 is_property_dash (unsigned int ch)
3407 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3410 /* See PropList.txt, UCD.html. */
3412 is_property_hyphen (unsigned int ch)
3414 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* True iff CH has general category P.  See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  if (is_category_P (ch))
    return true;
  return false;
}
/* True iff CH has general category Zl.  See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  if (is_category_Zl (ch))
    return true;
  return false;
}
/* True iff CH has general category Zp.  See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  if (is_category_Zp (ch))
    return true;
  return false;
}
3438 /* See PropList.txt, UCD.html. */
3440 is_property_quotation_mark (unsigned int ch)
3442 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3445 /* See PropList.txt, UCD.html. */
3447 is_property_sentence_terminal (unsigned int ch)
3449 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3452 /* See PropList.txt, UCD.html. */
3454 is_property_terminal_punctuation (unsigned int ch)
3456 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* True iff CH has general category Sc.  See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  if (is_category_Sc (ch))
    return true;
  return false;
}
3466 /* See Unicode 3.0 book, section 4.9,
3467 PropList.txt, UCD.html,
3468 DerivedCoreProperties.txt, UCD.html. */
/* Math test for CH, computed in two ways that must agree (see the assert
   at the end): one value is derived and includes the Other_Math bit, the
   other is the Math bit itself.  */
3470 is_property_math (unsigned int ch)
3474 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
/* The value listed in the data file.  */
3476 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3478 assert (result1 == result2);
3482 /* See PropList.txt, UCD.html. */
3484 is_property_other_math (unsigned int ch)
3486 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3489 /* See PropList-3.0.1.txt. */
3491 is_property_paired_punctuation (unsigned int ch)
3493 return unicode_pairedpunctuation[ch];
3496 /* See PropList-3.0.1.txt. */
3498 is_property_left_of_pair (unsigned int ch)
3500 return unicode_leftofpair[ch];
3503 /* See PropList-3.0.1.txt. */
3505 is_property_combining (unsigned int ch)
3507 return (unicode_attributes[ch].name != NULL
3508 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3509 || is_category_Mc (ch)
3510 || is_category_Me (ch)
3511 || is_category_Mn (ch)));
#if 0 /* same as is_property_bidi_non_spacing_mark */
/* True iff CH has an entry in unicode_attributes[] and its bidi category
   is UC_BIDI_NSM.  See PropList-3.0.1.txt.  */
static bool
is_property_non_spacing (unsigned int ch)
{
  if (unicode_attributes[ch].name == NULL)
    return false;
  return get_bidi_category (ch) == UC_BIDI_NSM;
}
#endif
3524 /* See PropList-3.0.1.txt. */
3526 is_property_composite (unsigned int ch)
3528 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3529 logical in some sense. */
3530 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3532 if (unicode_attributes[ch].name != NULL
3533 && unicode_attributes[ch].decomposition != NULL)
3535 /* Test whether the decomposition contains more than one character,
3536 and the first is not a space. */
3537 const char *decomp = unicode_attributes[ch].decomposition;
3538 if (decomp[0] == '<')
3540 decomp = strchr (decomp, '>') + 1;
3541 if (decomp[0] == ' ')
3544 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* True iff CH has general category Nd.  See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  if (is_category_Nd (ch))
    return true;
  return false;
}
3556 /* See PropList-3.0.1.txt. */
3558 is_property_numeric (unsigned int ch)
3560 return ((get_numeric_value (ch)).denominator > 0)
3561 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3562 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3565 /* See PropList.txt, UCD.html. */
3567 is_property_diacritic (unsigned int ch)
3569 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3572 /* See PropList.txt, UCD.html. */
3574 is_property_extender (unsigned int ch)
3576 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3579 /* See PropList-3.0.1.txt. */
/* Ignorable control test for CH: built from "general category Cc with bidi
   category UC_BIDI_BN" or "general category Cf".  */
3581 is_property_ignorable_control (unsigned int ch)
3583 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3584 || is_category_Cf (ch))
3588 /* ------------------------------------------------------------------------- */
3590 /* Output all properties. */
/* VERSION is handed on unchanged to output_predicate.
   For every property P listed below, the predicate is_property_P is passed
   to three output helpers, together with these names:
     - debug_output_predicate: "unictype/pr_P.txt",
     - output_predicate_test: "../tests/unictype/test-pr_P.c" and the
       expression "uc_is_property_P (c)",
     - output_predicate: "unictype/pr_P.h", the name "u_property_P" and the
       comment "Properties".  */
3592 output_properties (const char *version)
3594 #define PROPERTY(P) \
3595 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3596 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3597 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3598 PROPERTY(white_space)
3599 PROPERTY(alphabetic)
3600 PROPERTY(other_alphabetic)
3601 PROPERTY(not_a_character)
3602 PROPERTY(default_ignorable_code_point)
3603 PROPERTY(other_default_ignorable_code_point)
3604 PROPERTY(deprecated)
3605 PROPERTY(logical_order_exception)
3606 PROPERTY(variation_selector)
3607 PROPERTY(private_use)
3608 PROPERTY(unassigned_code_value)
3610 PROPERTY(other_uppercase)
3612 PROPERTY(other_lowercase)
3615 PROPERTY(case_ignorable)
3616 PROPERTY(changes_when_lowercased)
3617 PROPERTY(changes_when_uppercased)
3618 PROPERTY(changes_when_titlecased)
3619 PROPERTY(changes_when_casefolded)
3620 PROPERTY(changes_when_casemapped)
3621 PROPERTY(soft_dotted)
3623 PROPERTY(other_id_start)
3624 PROPERTY(id_continue)
3625 PROPERTY(other_id_continue)
3627 PROPERTY(xid_continue)
3628 PROPERTY(pattern_white_space)
3629 PROPERTY(pattern_syntax)
3630 PROPERTY(join_control)
3631 PROPERTY(grapheme_base)
3632 PROPERTY(grapheme_extend)
3633 PROPERTY(other_grapheme_extend)
3634 PROPERTY(grapheme_link)
3635 PROPERTY(bidi_control)
3636 PROPERTY(bidi_left_to_right)
3637 PROPERTY(bidi_hebrew_right_to_left)
3638 PROPERTY(bidi_arabic_right_to_left)
3639 PROPERTY(bidi_european_digit)
3640 PROPERTY(bidi_eur_num_separator)
3641 PROPERTY(bidi_eur_num_terminator)
3642 PROPERTY(bidi_arabic_digit)
3643 PROPERTY(bidi_common_separator)
3644 PROPERTY(bidi_block_separator)
3645 PROPERTY(bidi_segment_separator)
3646 PROPERTY(bidi_whitespace)
3647 PROPERTY(bidi_non_spacing_mark)
3648 PROPERTY(bidi_boundary_neutral)
3650 PROPERTY(bidi_embedding_or_override)
3651 PROPERTY(bidi_other_neutral)
3653 PROPERTY(ascii_hex_digit)
3654 PROPERTY(ideographic)
3655 PROPERTY(unified_ideograph)
3657 PROPERTY(ids_binary_operator)
3658 PROPERTY(ids_trinary_operator)
3659 PROPERTY(zero_width)
3662 PROPERTY(iso_control)
3663 PROPERTY(format_control)
3666 PROPERTY(punctuation)
3667 PROPERTY(line_separator)
3668 PROPERTY(paragraph_separator)
3669 PROPERTY(quotation_mark)
3670 PROPERTY(sentence_terminal)
3671 PROPERTY(terminal_punctuation)
3672 PROPERTY(currency_symbol)
3674 PROPERTY(other_math)
3675 PROPERTY(paired_punctuation)
3676 PROPERTY(left_of_pair)
3679 PROPERTY(decimal_digit)
3683 PROPERTY(ignorable_control)
3687 /* ========================================================================= */
3689 /* Arabic Shaping. */
/* Joining types.  The trailing comment of each enumerator gives the long
   name of the value.  fill_arabicshaping compares the joining type field of
   the data file with the part of the enumerator name that follows the
   16-character prefix "UC_JOINING_TYPE_", i.e. with a single letter.  */
3693 UC_JOINING_TYPE_U, /* Non_Joining */
3694 UC_JOINING_TYPE_T, /* Transparent */
3695 UC_JOINING_TYPE_C, /* Join_Causing */
3696 UC_JOINING_TYPE_L, /* Left_Joining */
3697 UC_JOINING_TYPE_R, /* Right_Joining */
3698 UC_JOINING_TYPE_D /* Dual_Joining */
/* Joining type of every code point, indexed by code point.
   fill_arabicshaping initializes every entry to (uint8_t)~(uint8_t)0
   (presumably meaning that no joining type is known -- TODO confirm).  */
3701 static uint8_t unicode_joining_type[0x110000];
/* Joining groups.  The trailing comment of each enumerator gives the
   property value name.  fill_arabicshaping matches the joining group field
   of the data file against upper case, blank separated spellings such as
   "BURUSHASKI YEH BARREE".  */
3705 UC_JOINING_GROUP_NONE, /* No_Joining_Group */
3706 UC_JOINING_GROUP_AIN, /* Ain */
3707 UC_JOINING_GROUP_ALAPH, /* Alaph */
3708 UC_JOINING_GROUP_ALEF, /* Alef */
3709 UC_JOINING_GROUP_BEH, /* Beh */
3710 UC_JOINING_GROUP_BETH, /* Beth */
3711 UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
3712 UC_JOINING_GROUP_DAL, /* Dal */
3713 UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
3714 UC_JOINING_GROUP_E, /* E */
3715 UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
3716 UC_JOINING_GROUP_FE, /* Fe */
3717 UC_JOINING_GROUP_FEH, /* Feh */
3718 UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
3719 UC_JOINING_GROUP_GAF, /* Gaf */
3720 UC_JOINING_GROUP_GAMAL, /* Gamal */
3721 UC_JOINING_GROUP_HAH, /* Hah */
3722 UC_JOINING_GROUP_HE, /* He */
3723 UC_JOINING_GROUP_HEH, /* Heh */
3724 UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
3725 UC_JOINING_GROUP_HETH, /* Heth */
3726 UC_JOINING_GROUP_KAF, /* Kaf */
3727 UC_JOINING_GROUP_KAPH, /* Kaph */
3728 UC_JOINING_GROUP_KHAPH, /* Khaph */
3729 UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
3730 UC_JOINING_GROUP_LAM, /* Lam */
3731 UC_JOINING_GROUP_LAMADH, /* Lamadh */
3732 UC_JOINING_GROUP_MEEM, /* Meem */
3733 UC_JOINING_GROUP_MIM, /* Mim */
3734 UC_JOINING_GROUP_NOON, /* Noon */
3735 UC_JOINING_GROUP_NUN, /* Nun */
3736 UC_JOINING_GROUP_NYA, /* Nya */
3737 UC_JOINING_GROUP_PE, /* Pe */
3738 UC_JOINING_GROUP_QAF, /* Qaf */
3739 UC_JOINING_GROUP_QAPH, /* Qaph */
3740 UC_JOINING_GROUP_REH, /* Reh */
3741 UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
3742 UC_JOINING_GROUP_SAD, /* Sad */
3743 UC_JOINING_GROUP_SADHE, /* Sadhe */
3744 UC_JOINING_GROUP_SEEN, /* Seen */
3745 UC_JOINING_GROUP_SEMKATH, /* Semkath */
3746 UC_JOINING_GROUP_SHIN, /* Shin */
3747 UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
3748 UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
3749 UC_JOINING_GROUP_TAH, /* Tah */
3750 UC_JOINING_GROUP_TAW, /* Taw */
3751 UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
3752 UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
3753 UC_JOINING_GROUP_TETH, /* Teth */
3754 UC_JOINING_GROUP_WAW, /* Waw */
3755 UC_JOINING_GROUP_YEH, /* Yeh */
3756 UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
3757 UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
3758 UC_JOINING_GROUP_YUDH, /* Yudh */
3759 UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
3760 UC_JOINING_GROUP_ZAIN, /* Zain */
3761 UC_JOINING_GROUP_ZHAIN, /* Zhain */
3762 UC_JOINING_GROUP_ROHINGYA_YEH, /* Rohingya_Yeh */
3763 UC_JOINING_GROUP_STRAIGHT_WAW, /* Straight_Waw */
3764 UC_JOINING_GROUP_MANICHAEAN_ALEPH, /* Manichaean_Aleph */
3765 UC_JOINING_GROUP_MANICHAEAN_BETH, /* Manichaean_Beth */
3766 UC_JOINING_GROUP_MANICHAEAN_GIMEL, /* Manichaean_Gimel */
3767 UC_JOINING_GROUP_MANICHAEAN_DALETH, /* Manichaean_Daleth */
3768 UC_JOINING_GROUP_MANICHAEAN_WAW, /* Manichaean_Waw */
3769 UC_JOINING_GROUP_MANICHAEAN_ZAYIN, /* Manichaean_Zayin */
3770 UC_JOINING_GROUP_MANICHAEAN_HETH, /* Manichaean_Heth */
3771 UC_JOINING_GROUP_MANICHAEAN_TETH, /* Manichaean_Teth */
3772 UC_JOINING_GROUP_MANICHAEAN_YODH, /* Manichaean_Yodh */
3773 UC_JOINING_GROUP_MANICHAEAN_KAPH, /* Manichaean_Kaph */
3774 UC_JOINING_GROUP_MANICHAEAN_LAMEDH, /* Manichaean_Lamedh */
3775 UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, /* Manichaean_Dhamedh */
3776 UC_JOINING_GROUP_MANICHAEAN_THAMEDH, /* Manichaean_Thamedh */
3777 UC_JOINING_GROUP_MANICHAEAN_MEM, /* Manichaean_Mem */
3778 UC_JOINING_GROUP_MANICHAEAN_NUN, /* Manichaean_Nun */
3779 UC_JOINING_GROUP_MANICHAEAN_SAMEKH, /* Manichaean_Samekh */
3780 UC_JOINING_GROUP_MANICHAEAN_AYIN, /* Manichaean_Ayin */
3781 UC_JOINING_GROUP_MANICHAEAN_PE, /* Manichaean_Pe */
3782 UC_JOINING_GROUP_MANICHAEAN_SADHE, /* Manichaean_Sadhe */
3783 UC_JOINING_GROUP_MANICHAEAN_QOPH, /* Manichaean_Qoph */
3784 UC_JOINING_GROUP_MANICHAEAN_RESH, /* Manichaean_Resh */
3785 UC_JOINING_GROUP_MANICHAEAN_TAW, /* Manichaean_Taw */
3786 UC_JOINING_GROUP_MANICHAEAN_ONE, /* Manichaean_One */
3787 UC_JOINING_GROUP_MANICHAEAN_FIVE, /* Manichaean_Five */
3788 UC_JOINING_GROUP_MANICHAEAN_TEN, /* Manichaean_Ten */
3789 UC_JOINING_GROUP_MANICHAEAN_TWENTY, /* Manichaean_Twenty */
3790 UC_JOINING_GROUP_MANICHAEAN_HUNDRED /* Manichaean_Hundred */
/* Joining group of every code point, indexed by code point.
   fill_arabicshaping initializes every entry to UC_JOINING_GROUP_NONE.  */
3793 static uint8_t unicode_joining_group[0x110000];
/* Read the ArabicShaping file ARABICSHAPING_FILENAME and record, for every
   code point it lists, the joining type in unicode_joining_type[] and the
   joining group in unicode_joining_group[].  Code points that the file does
   not mention keep the initial values assigned below.  An unopenable file,
   an unparsable line, an unknown joining type or joining group, and a read
   or close error each produce a message on stderr.  */
3796 fill_arabicshaping (const char *arabicshaping_filename)
3802 stream = fopen (arabicshaping_filename, "r");
3805 fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
/* Initial state of both tables: the all-ones byte as joining type (the value
   that output_joining_type_test treats as "nothing to print") and
   UC_JOINING_GROUP_NONE as joining group.  */
3809 for (i = 0; i < 0x110000; i++)
3811 unicode_joining_type[i] = (uint8_t)~(uint8_t)0;
3812 unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
/* Buffers for the pieces that sscanf splits one input line into.  The
   separatorN and paddingN buffers only absorb the ';' characters and the
   blanks between the real fields (schematic name, joining type, joining
   group).  */
3819 char separator1[100+1];
3820 char padding1[100+1];
3821 char schematic_name[100+1];
3822 char separator2[100+1];
3823 char padding2[100+1];
3824 char joining_type_name[100+1];
3825 char separator3[100+1];
3826 char padding3[100+1];
3827 char joining_group_name[100+1];
/* Fetch the next line into buf: at most 100 characters up to, but not
   including, the newline.  Lines that are empty or start with '#' are
   recognized by the test that follows.  */
3832 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
3835 if (buf[0] == '\0' || buf[0] == '#')
3838 if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]",
3839 &i, separator1, padding1, schematic_name, separator2,
3840 padding2, joining_type_name, separator3, padding3,
3841 joining_group_name) != 10)
3843 fprintf (stderr, "parse error in '%s':%d\n",
3844 arabicshaping_filename, lineno);
/* The parsed code point i is used as index into the 0x110000-entry tables.  */
3847 assert (i < 0x110000);
/* Map the joining type field onto a UC_JOINING_TYPE_* constant.
   #name + 16 skips the 16 characters of the prefix "UC_JOINING_TYPE_", so
   the field is compared with the part of the constant's name after it.  */
3849 #define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
3851 TRY(UC_JOINING_TYPE_U)
3852 TRY(UC_JOINING_TYPE_T)
3853 TRY(UC_JOINING_TYPE_C)
3854 TRY(UC_JOINING_TYPE_L)
3855 TRY(UC_JOINING_TYPE_R)
3856 TRY(UC_JOINING_TYPE_D)
3860 fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
3861 joining_type_name, arabicshaping_filename, lineno);
3865 /* Remove trailing spaces. */
3866 while (joining_group_name[0] != '\0'
3867 && joining_group_name[strlen (joining_group_name) - 1] == ' ')
3868 joining_group_name[strlen (joining_group_name) - 1] = '\0';
/* Map the joining group field onto a UC_JOINING_GROUP_* constant.  The field
   must equal one of the spellings listed below exactly (strcmp).  */
3870 #define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
3872 TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group")
3873 TRY(UC_JOINING_GROUP_AIN, "AIN")
3874 TRY(UC_JOINING_GROUP_ALAPH, "ALAPH")
3875 TRY(UC_JOINING_GROUP_ALEF, "ALEF")
3876 TRY(UC_JOINING_GROUP_BEH, "BEH")
3877 TRY(UC_JOINING_GROUP_BETH, "BETH")
3878 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
3879 TRY(UC_JOINING_GROUP_DAL, "DAL")
3880 TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH")
3881 TRY(UC_JOINING_GROUP_E, "E")
3882 TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH")
3883 TRY(UC_JOINING_GROUP_FE, "FE")
3884 TRY(UC_JOINING_GROUP_FEH, "FEH")
3885 TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH")
3886 TRY(UC_JOINING_GROUP_GAF, "GAF")
3887 TRY(UC_JOINING_GROUP_GAMAL, "GAMAL")
3888 TRY(UC_JOINING_GROUP_HAH, "HAH")
3889 TRY(UC_JOINING_GROUP_HE, "HE")
3890 TRY(UC_JOINING_GROUP_HEH, "HEH")
3891 TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL")
3892 TRY(UC_JOINING_GROUP_HETH, "HETH")
3893 TRY(UC_JOINING_GROUP_KAF, "KAF")
3894 TRY(UC_JOINING_GROUP_KAPH, "KAPH")
3895 TRY(UC_JOINING_GROUP_KHAPH, "KHAPH")
3896 TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH")
3897 TRY(UC_JOINING_GROUP_LAM, "LAM")
3898 TRY(UC_JOINING_GROUP_LAMADH, "LAMADH")
3899 TRY(UC_JOINING_GROUP_MEEM, "MEEM")
3900 TRY(UC_JOINING_GROUP_MIM, "MIM")
3901 TRY(UC_JOINING_GROUP_NOON, "NOON")
3902 TRY(UC_JOINING_GROUP_NUN, "NUN")
3903 TRY(UC_JOINING_GROUP_NYA, "NYA")
3904 TRY(UC_JOINING_GROUP_PE, "PE")
3905 TRY(UC_JOINING_GROUP_QAF, "QAF")
3906 TRY(UC_JOINING_GROUP_QAPH, "QAPH")
3907 TRY(UC_JOINING_GROUP_REH, "REH")
3908 TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE")
3909 TRY(UC_JOINING_GROUP_SAD, "SAD")
3910 TRY(UC_JOINING_GROUP_SADHE, "SADHE")
3911 TRY(UC_JOINING_GROUP_SEEN, "SEEN")
3912 TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH")
3913 TRY(UC_JOINING_GROUP_SHIN, "SHIN")
3914 TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF")
3915 TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW")
3916 TRY(UC_JOINING_GROUP_TAH, "TAH")
3917 TRY(UC_JOINING_GROUP_TAW, "TAW")
3918 TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA")
3919 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL")
3920 TRY(UC_JOINING_GROUP_TETH, "TETH")
3921 TRY(UC_JOINING_GROUP_WAW, "WAW")
3922 TRY(UC_JOINING_GROUP_YEH, "YEH")
3923 TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE")
3924 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL")
3925 TRY(UC_JOINING_GROUP_YUDH, "YUDH")
3926 TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE")
3927 TRY(UC_JOINING_GROUP_ZAIN, "ZAIN")
3928 TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN")
3929 TRY(UC_JOINING_GROUP_ROHINGYA_YEH, "ROHINGYA YEH")
3930 TRY(UC_JOINING_GROUP_STRAIGHT_WAW, "STRAIGHT WAW")
3931 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH, "MANICHAEAN ALEPH")
3932 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH, "MANICHAEAN BETH")
3933 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL, "MANICHAEAN GIMEL")
3934 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH, "MANICHAEAN DALETH")
3935 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW, "MANICHAEAN WAW")
3936 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN, "MANICHAEAN ZAYIN")
3937 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH, "MANICHAEAN HETH")
3938 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH, "MANICHAEAN TETH")
3939 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH, "MANICHAEAN YODH")
3940 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH, "MANICHAEAN KAPH")
3941 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH, "MANICHAEAN LAMEDH")
3942 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, "MANICHAEAN DHAMEDH")
3943 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH, "MANICHAEAN THAMEDH")
3944 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM, "MANICHAEAN MEM")
3945 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN, "MANICHAEAN NUN")
3946 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH, "MANICHAEAN SAMEKH")
3947 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN, "MANICHAEAN AYIN")
3948 TRY(UC_JOINING_GROUP_MANICHAEAN_PE, "MANICHAEAN PE")
3949 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE, "MANICHAEAN SADHE")
3950 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH, "MANICHAEAN QOPH")
3951 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH, "MANICHAEAN RESH")
3952 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW, "MANICHAEAN TAW")
3953 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE, "MANICHAEAN ONE")
3954 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE, "MANICHAEAN FIVE")
3955 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN, "MANICHAEAN TEN")
3956 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY, "MANICHAEAN TWENTY")
3957 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED, "MANICHAEAN HUNDRED")
3961 fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
3962 joining_group_name, arabicshaping_filename, lineno);
/* Record both properties for code point i.  */
3966 unicode_joining_type[i] = joining_type;
3967 unicode_joining_group[i] = joining_group;
/* A stream error or a failing fclose is reported as a read error.  */
3970 if (ferror (stream) || fclose (stream))
3972 fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
3977 /* Convert a Joining_Type value to a C identifier. */
/* JOINING_TYPE is compared with each UC_JOINING_TYPE_* constant listed below.
   On the first match the name of that constant is returned as a string
   literal; the TRY macro produces it by stringifying its argument (#value).  */
3979 joining_type_as_c_identifier (int joining_type)
3981 #define TRY(value) if (joining_type == value) return #value;
3982 TRY(UC_JOINING_TYPE_U)
3983 TRY(UC_JOINING_TYPE_T)
3984 TRY(UC_JOINING_TYPE_C)
3985 TRY(UC_JOINING_TYPE_L)
3986 TRY(UC_JOINING_TYPE_R)
3987 TRY(UC_JOINING_TYPE_D)
/* Write to FILENAME a generated list of "{ 0xXXXX, UC_JOINING_TYPE_x }"
   entries, one for each code point whose unicode_joining_type[] value is not
   the all-ones byte.  The file starts with a "DO NOT EDIT" header comment.
   VERSION is expected to be the Unicode version string for the
   "for Unicode %s" line of that header.
   Failure to open or to write the file produces a message on stderr.  */
3993 output_joining_type_test (const char *filename, const char *version)
3999 stream = fopen (filename, "w");
4002 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4006 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4007 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4008 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4012 for (ch = 0; ch < 0x110000; ch++)
4014 int value = unicode_joining_type[ch];
/* The all-ones byte is the value fill_arabicshaping leaves in entries that
   its input file does not mention.  */
4016 if (value != (uint8_t)~(uint8_t)0)
/* ",\n" is the separator between entries; the entry itself carries no
   trailing newline, which is why a final "\n" is written after the loop.  */
4019 fprintf (stream, ",\n");
4020 fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
4025 fprintf (stream, "\n");
/* A stream error or a failing fclose is reported as a write error.  */
4027 if (ferror (stream) || fclose (stream))
4029 fprintf (stderr, "error writing to '%s'\n", filename);
4034 /* Construction of sparse 3-level tables. */
/* Parameters of the table used by output_joining_type: TABLE is the name
   (output_joining_type declares a struct joining_type_table and calls
   joining_type_table_init/_add/_finalize), ELEMENT the type of one stored
   value, DEFAULT the all-ones byte.  xmalloc and xrealloc are mapped to plain
   malloc and realloc.  */
4035 #define TABLE joining_type_table
4036 #define ELEMENT uint8_t
4037 #define DEFAULT (uint8_t)~(uint8_t)0
4038 #define xmalloc malloc
4039 #define xrealloc realloc
/* Write to FILENAME the joining type of all code points as a 3-level lookup
   table.  The generated text consists of a "DO NOT EDIT" header comment,
   five #defines joining_type_header_0 .. joining_type_header_4, and a
   static const struct named u_joining_type with the arrays level1, level2
   and level3.  The level3 values are packed two per byte, 4 bits each.
   VERSION is expected to be the Unicode version string for the
   "for Unicode %s" line of the header.
   Failure to open or to write the file produces a message on stderr.  */
4043 output_joining_type (const char *filename, const char *version)
4047 struct joining_type_table t;
4048 unsigned int level1_offset, level2_offset, level3_offset;
4049 uint8_t *level3_packed;
4051 stream = fopen (filename, "w");
4054 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4058 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4059 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4060 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4065 joining_type_table_init (&t);
/* Feed the value of every code point, the all-ones byte included, into the
   table.  */
4067 for (ch = 0; ch < 0x110000; ch++)
4069 uint8_t value = unicode_joining_type[ch];
/* Apart from the all-ones byte, a value must fit into the 4 bits that the
   level3 packing below keeps of each entry (value & 0x0f).  */
4071 assert (value == (uint8_t)~(uint8_t)0 || value <= 0x0f);
4073 joining_type_table_add (&t, ch, value);
4076 joining_type_table_finalize (&t);
/* Layout of t.result as used by the offsets below: 5 uint32_t header words,
   then t.level1_size uint32_t words of level1, then t.level2_size << t.q
   uint32_t words of level2, then the level3 bytes.  */
4078 /* Offsets in t.result, in memory of this process. */
4080 5 * sizeof (uint32_t);
4082 5 * sizeof (uint32_t)
4083 + t.level1_size * sizeof (uint32_t);
4085 5 * sizeof (uint32_t)
4086 + t.level1_size * sizeof (uint32_t)
4087 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words become the joining_type_header_N macros.  */
4089 for (i = 0; i < 5; i++)
4090 fprintf (stream, "#define joining_type_header_%d %d\n", i,
4091 ((uint32_t *) t.result)[i]);
/* Declaration of the generated struct.  level3 is declared with
   (1 << t.p) * 4 / 8 bytes per level3 block, i.e. 4 bits per entry.  */
4092 fprintf (stream, "static const\n");
4093 fprintf (stream, "struct\n");
4094 fprintf (stream, " {\n");
4095 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4096 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4097 fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size,
4098 (1 << t.p) * 4 / 8);
4099 fprintf (stream, " }\n");
4100 fprintf (stream, "u_joining_type =\n");
4101 fprintf (stream, "{\n");
/* level1 initializer, 8 entries per output line.  An entry is printed either
   as -1 or as its offset relative to level2_offset, counted in uint32_t
   units.  */
4102 fprintf (stream, " {");
4103 if (t.level1_size > 8)
4104 fprintf (stream, "\n ");
4105 for (i = 0; i < t.level1_size; i++)
4108 if (i > 0 && (i % 8) == 0)
4109 fprintf (stream, "\n ");
4110 offset = ((uint32_t *) (t.result + level1_offset))[i];
4112 fprintf (stream, " %5d", -1);
4114 fprintf (stream, " %5zu",
4115 (offset - level2_offset) / sizeof (uint32_t));
4116 if (i+1 < t.level1_size)
4117 fprintf (stream, ",");
4119 if (t.level1_size > 8)
4120 fprintf (stream, "\n ");
4121 fprintf (stream, " },\n");
/* level2 initializer, 8 entries per output line.  An entry is printed either
   as -1 or as its offset relative to level3_offset, counted in uint8_t
   units.  */
4122 fprintf (stream, " {");
4123 if (t.level2_size << t.q > 8)
4124 fprintf (stream, "\n ");
4125 for (i = 0; i < t.level2_size << t.q; i++)
4128 if (i > 0 && (i % 8) == 0)
4129 fprintf (stream, "\n ");
4130 offset = ((uint32_t *) (t.result + level2_offset))[i];
4132 fprintf (stream, " %5d", -1);
4134 fprintf (stream, " %5zu",
4135 (offset - level3_offset) / sizeof (uint8_t));
4136 if (i+1 < t.level2_size << t.q)
4137 fprintf (stream, ",");
4139 if (t.level2_size << t.q > 8)
4140 fprintf (stream, "\n ");
4141 fprintf (stream, " },\n");
4142 /* Pack the level3 array. Each entry needs 4 bits only. */
4144 (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
4145 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i goes into byte j at bit position k (0 or 4).  The buffer comes
   zero-filled from calloc, so OR-ing the 4-bit value in is sufficient.  */
4147 unsigned int j = (i * 4) / 8;
4148 unsigned int k = (i * 4) % 8;
4149 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
4150 level3_packed[j] |= (value << k);
/* level3 initializer: the packed bytes in hexadecimal, 8 per output line.  */
4152 fprintf (stream, " {");
4153 if ((t.level3_size << t.p) * 4 / 8 > 8)
4154 fprintf (stream, "\n ");
4155 for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
4157 if (i > 0 && (i % 8) == 0)
4158 fprintf (stream, "\n ");
4159 fprintf (stream, " 0x%02x", level3_packed[i]);
4160 if (i+1 < (t.level3_size << t.p) * 4 / 8)
4161 fprintf (stream, ",");
4163 if ((t.level3_size << t.p) * 4 / 8 > 8)
4164 fprintf (stream, "\n ");
4165 fprintf (stream, " }\n");
4166 free (level3_packed);
4167 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error.  */
4169 if (ferror (stream) || fclose (stream))
4171 fprintf (stderr, "error writing to '%s'\n", filename);
4176 /* Convert a Joining_Group value to a C identifier. */
/* JOINING_GROUP is compared with each UC_JOINING_GROUP_* constant listed
   below.  On the first match the name of that constant is returned as a
   string literal; the TRY macro produces it by stringifying its argument
   (#value).  */
4178 joining_group_as_c_identifier (int joining_group)
4180 #define TRY(value) if (joining_group == value) return #value;
4181 TRY(UC_JOINING_GROUP_NONE)
4182 TRY(UC_JOINING_GROUP_AIN)
4183 TRY(UC_JOINING_GROUP_ALAPH)
4184 TRY(UC_JOINING_GROUP_ALEF)
4185 TRY(UC_JOINING_GROUP_BEH)
4186 TRY(UC_JOINING_GROUP_BETH)
4187 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
4188 TRY(UC_JOINING_GROUP_DAL)
4189 TRY(UC_JOINING_GROUP_DALATH_RISH)
4190 TRY(UC_JOINING_GROUP_E)
4191 TRY(UC_JOINING_GROUP_FARSI_YEH)
4192 TRY(UC_JOINING_GROUP_FE)
4193 TRY(UC_JOINING_GROUP_FEH)
4194 TRY(UC_JOINING_GROUP_FINAL_SEMKATH)
4195 TRY(UC_JOINING_GROUP_GAF)
4196 TRY(UC_JOINING_GROUP_GAMAL)
4197 TRY(UC_JOINING_GROUP_HAH)
4198 TRY(UC_JOINING_GROUP_HE)
4199 TRY(UC_JOINING_GROUP_HEH)
4200 TRY(UC_JOINING_GROUP_HEH_GOAL)
4201 TRY(UC_JOINING_GROUP_HETH)
4202 TRY(UC_JOINING_GROUP_KAF)
4203 TRY(UC_JOINING_GROUP_KAPH)
4204 TRY(UC_JOINING_GROUP_KHAPH)
4205 TRY(UC_JOINING_GROUP_KNOTTED_HEH)
4206 TRY(UC_JOINING_GROUP_LAM)
4207 TRY(UC_JOINING_GROUP_LAMADH)
4208 TRY(UC_JOINING_GROUP_MEEM)
4209 TRY(UC_JOINING_GROUP_MIM)
4210 TRY(UC_JOINING_GROUP_NOON)
4211 TRY(UC_JOINING_GROUP_NUN)
4212 TRY(UC_JOINING_GROUP_NYA)
4213 TRY(UC_JOINING_GROUP_PE)
4214 TRY(UC_JOINING_GROUP_QAF)
4215 TRY(UC_JOINING_GROUP_QAPH)
4216 TRY(UC_JOINING_GROUP_REH)
4217 TRY(UC_JOINING_GROUP_REVERSED_PE)
4218 TRY(UC_JOINING_GROUP_SAD)
4219 TRY(UC_JOINING_GROUP_SADHE)
4220 TRY(UC_JOINING_GROUP_SEEN)
4221 TRY(UC_JOINING_GROUP_SEMKATH)
4222 TRY(UC_JOINING_GROUP_SHIN)
4223 TRY(UC_JOINING_GROUP_SWASH_KAF)
4224 TRY(UC_JOINING_GROUP_SYRIAC_WAW)
4225 TRY(UC_JOINING_GROUP_TAH)
4226 TRY(UC_JOINING_GROUP_TAW)
4227 TRY(UC_JOINING_GROUP_TEH_MARBUTA)
4228 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
4229 TRY(UC_JOINING_GROUP_TETH)
4230 TRY(UC_JOINING_GROUP_WAW)
4231 TRY(UC_JOINING_GROUP_YEH)
4232 TRY(UC_JOINING_GROUP_YEH_BARREE)
4233 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL)
4234 TRY(UC_JOINING_GROUP_YUDH)
4235 TRY(UC_JOINING_GROUP_YUDH_HE)
4236 TRY(UC_JOINING_GROUP_ZAIN)
4237 TRY(UC_JOINING_GROUP_ZHAIN)
4238 TRY(UC_JOINING_GROUP_ROHINGYA_YEH)
4239 TRY(UC_JOINING_GROUP_STRAIGHT_WAW)
4240 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH)
4241 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH)
4242 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL)
4243 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH)
4244 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW)
4245 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN)
4246 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH)
4247 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH)
4248 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH)
4249 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH)
4250 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH)
4251 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH)
4252 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH)
4253 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM)
4254 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN)
4255 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH)
4256 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN)
4257 TRY(UC_JOINING_GROUP_MANICHAEAN_PE)
4258 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE)
4259 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH)
4260 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH)
4261 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW)
4262 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE)
4263 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE)
4264 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN)
4265 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY)
4266 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED)
/* Write to FILENAME a generated list of "{ 0xXXXX, UC_JOINING_GROUP_x }"
   entries, one for each code point whose unicode_joining_group[] value
   differs from UC_JOINING_GROUP_NONE.  The file starts with a "DO NOT EDIT"
   header comment.  VERSION is expected to be the Unicode version string for
   the "for Unicode %s" line of that header.
   Failure to open or to write the file produces a message on stderr.  */
4272 output_joining_group_test (const char *filename, const char *version)
4278 stream = fopen (filename, "w");
4281 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4285 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4286 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4287 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4291 for (ch = 0; ch < 0x110000; ch++)
4293 int value = unicode_joining_group[ch];
/* UC_JOINING_GROUP_NONE is the value fill_arabicshaping assigns to every
   entry before parsing its input file.  */
4295 if (value != UC_JOINING_GROUP_NONE)
/* ",\n" is the separator between entries; the entry itself carries no
   trailing newline, which is why a final "\n" is written after the loop.  */
4298 fprintf (stream, ",\n");
4299 fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value));
4304 fprintf (stream, "\n");
/* A stream error or a failing fclose is reported as a write error.  */
4306 if (ferror (stream) || fclose (stream))
4308 fprintf (stderr, "error writing to '%s'\n", filename);
4313 /* Construction of sparse 3-level tables. */
/* Parameters of the table used by output_joining_group: TABLE is the name
   (output_joining_group declares a struct joining_group_table and calls
   joining_group_table_init/_add/_finalize), ELEMENT the type of one stored
   value, DEFAULT is UC_JOINING_GROUP_NONE.  xmalloc and xrealloc are mapped
   to plain malloc and realloc.  */
4314 #define TABLE joining_group_table
4315 #define ELEMENT uint8_t
4316 #define DEFAULT UC_JOINING_GROUP_NONE
4317 #define xmalloc malloc
4318 #define xrealloc realloc
/* Write to FILENAME the joining group of all code points as a 3-level lookup
   table.  The generated text consists of a "DO NOT EDIT" header comment,
   five #defines joining_group_header_0 .. joining_group_header_4, and a
   static const struct named u_joining_group with the arrays level1, level2
   and level3.  The level3 values are packed at 7 bits each into 16-bit
   units.  VERSION is expected to be the Unicode version string for the
   "for Unicode %s" line of the header.
   Failure to open or to write the file produces a message on stderr.  */
4322 output_joining_group (const char *filename, const char *version)
4326 struct joining_group_table t;
4327 unsigned int level1_offset, level2_offset, level3_offset;
4328 uint16_t *level3_packed;
4330 stream = fopen (filename, "w");
4333 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4337 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4338 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4339 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4344 joining_group_table_init (&t);
/* Feed the value of every code point into the table.  */
4346 for (ch = 0; ch < 0x110000; ch++)
4348 uint8_t value = unicode_joining_group[ch];
/* Each value must fit into the 7 bits that the level3 packing below gives to
   an entry.  */
4350 assert (value <= 0x7f);
4352 joining_group_table_add (&t, ch, value);
4355 joining_group_table_finalize (&t);
/* Layout of t.result as used by the offsets below: 5 uint32_t header words,
   then t.level1_size uint32_t words of level1, then t.level2_size << t.q
   uint32_t words of level2, then the level3 bytes.  */
4357 /* Offsets in t.result, in memory of this process. */
4359 5 * sizeof (uint32_t);
4361 5 * sizeof (uint32_t)
4362 + t.level1_size * sizeof (uint32_t);
4364 5 * sizeof (uint32_t)
4365 + t.level1_size * sizeof (uint32_t)
4366 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words become the joining_group_header_N macros.  */
4368 for (i = 0; i < 5; i++)
4369 fprintf (stream, "#define joining_group_header_%d %d\n", i,
4370 ((uint32_t *) t.result)[i]);
/* Declaration of the generated struct.  level3 is declared with
   (1 << t.p) * 7 / 16 unsigned shorts per level3 block plus one extra
   element, matching the size of level3_packed below.  */
4371 fprintf (stream, "static const\n");
4372 fprintf (stream, "struct\n");
4373 fprintf (stream, " {\n");
4374 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4375 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4376 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
4377 (1 << t.p) * 7 / 16);
4378 fprintf (stream, " }\n");
4379 fprintf (stream, "u_joining_group =\n");
4380 fprintf (stream, "{\n");
/* level1 initializer, 8 entries per output line.  An entry is printed either
   as -1 or as its offset relative to level2_offset, counted in uint32_t
   units.  */
4381 fprintf (stream, " {");
4382 if (t.level1_size > 8)
4383 fprintf (stream, "\n ");
4384 for (i = 0; i < t.level1_size; i++)
4387 if (i > 0 && (i % 8) == 0)
4388 fprintf (stream, "\n ");
4389 offset = ((uint32_t *) (t.result + level1_offset))[i];
4391 fprintf (stream, " %5d", -1);
4393 fprintf (stream, " %5zu",
4394 (offset - level2_offset) / sizeof (uint32_t));
4395 if (i+1 < t.level1_size)
4396 fprintf (stream, ",");
4398 if (t.level1_size > 8)
4399 fprintf (stream, "\n ");
4400 fprintf (stream, " },\n");
/* level2 initializer, 8 entries per output line.  An entry is printed either
   as -1 or as its offset relative to level3_offset, counted in uint8_t
   units.  */
4401 fprintf (stream, " {");
4402 if (t.level2_size << t.q > 8)
4403 fprintf (stream, "\n ");
4404 for (i = 0; i < t.level2_size << t.q; i++)
4407 if (i > 0 && (i % 8) == 0)
4408 fprintf (stream, "\n ");
4409 offset = ((uint32_t *) (t.result + level2_offset))[i];
4411 fprintf (stream, " %5d", -1);
4413 fprintf (stream, " %5zu",
4414 (offset - level3_offset) / sizeof (uint8_t));
4415 if (i+1 < t.level2_size << t.q)
4416 fprintf (stream, ",");
4418 if (t.level2_size << t.q > 8)
4419 fprintf (stream, "\n ");
4420 fprintf (stream, " },\n");
4421 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
4422 not 32-bit units, in order to make the lookup function easier. */
4425 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
4426 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i starts at bit k of unit j and may extend into unit j+1.  Units j
   and j+1 are therefore combined into one 32-bit value, the entry is OR-ed
   in at bit k, and the result is split back into the two units.  The access
   to unit j+1 is why the buffer has one element beyond
   (t.level3_size << t.p) * 7 / 16.  */
4428 unsigned int j = (i * 7) / 16;
4429 unsigned int k = (i * 7) % 16;
4430 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
4431 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
4432 level3_packed[j] = value & 0xffff;
4433 level3_packed[j+1] = value >> 16;
/* level3 initializer: the packed 16-bit units in hexadecimal, 8 per output
   line.  */
4435 fprintf (stream, " {");
4436 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
4437 fprintf (stream, "\n ");
4438 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
4440 if (i > 0 && (i % 8) == 0)
4441 fprintf (stream, "\n ");
4442 fprintf (stream, " 0x%04x", level3_packed[i]);
4443 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
4444 fprintf (stream, ",");
4446 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
4447 fprintf (stream, "\n ");
4448 fprintf (stream, " }\n");
4449 free (level3_packed);
4450 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error.  */
4452 if (ferror (stream) || fclose (stream))
4454 fprintf (stderr, "error writing to '%s'\n", filename);
4459 /* ========================================================================= */
/* Script data filled by fill_scripts.
   scripts[] holds the distinct script names (fill_scripts stores each new
   name at index numscripts), numscripts relates to how many are in use, and
   unicode_scripts[] maps a code point to an index into scripts[] or to the
   all-ones byte that fill_scripts uses as initial value.  */
4463 static const char *scripts[256];
4464 static unsigned int numscripts;
4466 static uint8_t unicode_scripts[0x110000];
/* Read the scripts file SCRIPTS_FILENAME and fill scripts[] and
   unicode_scripts[] from it.  Every unicode_scripts[] entry starts out as the
   all-ones byte.  An unopenable file, an unparsable line and a read or close
   error each produce a message on stderr, as does a code point that gets a
   script assigned more than once.  */
4469 fill_scripts (const char *scripts_filename)
4474 stream = fopen (scripts_filename, "r");
4477 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
4483 for (i = 0; i < 0x110000; i++)
4484 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
/* Per input line: i1 and i2 receive the code point(s), padding absorbs the
   blanks and ';' after them, and scriptname receives the script name up to
   the next blank.  Two line shapes are tried: "XXXX..YYYY ; Name" first,
   then "XXXX ; Name".  */
4489 unsigned int i1, i2;
4490 char padding[200+1];
4491 char scriptname[200+1];
/* Fetch the next line into buf: at most 200 characters up to, but not
   including, the newline.  Lines that are empty or start with '#' are
   recognized by the test that follows.  */
4494 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4497 if (buf[0] == '\0' || buf[0] == '#')
4500 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
4502 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
4504 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
/* i2 is the largest index used in the assignment loop below.  */
4510 assert (i2 < 0x110000);
/* Look scriptname up among the names already known, most recently added
   first; a name that is not found is duplicated and stored at index
   numscripts.  */
4512 for (script = numscripts - 1; script >= 0; script--)
4513 if (strcmp (scripts[script], scriptname) == 0)
4517 scripts[numscripts] = strdup (scriptname);
4518 script = numscripts;
/* scripts[] has 256 slots.  */
4520 assert (numscripts != 256);
/* Assign the script to the whole range.  A code point that already had a
   script is reported, and the new assignment replaces the old one.  */
4523 for (i = i1; i <= i2; i++)
4525 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
4526 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
4527 unicode_scripts[i] = script;
/* A stream error or a failing fclose is reported as a read error.  */
4531 if (ferror (stream) || fclose (stream))
4533 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
4538 /* Construction of sparse 3-level tables. */
/* Parameters of the table used by output_scripts: TABLE is the name
   (output_scripts declares a struct script_table and calls
   script_table_init/_add/_finalize), ELEMENT the type of one stored value,
   DEFAULT the all-ones byte.  xmalloc and xrealloc are mapped to plain malloc
   and realloc.  */
4539 #define TABLE script_table
4540 #define ELEMENT uint8_t
4541 #define DEFAULT (uint8_t)~(uint8_t)0
4542 #define xmalloc malloc
4543 #define xrealloc realloc
/* Write the file "unictype/scripts.h".  The generated text consists of
     - a "DO NOT EDIT" header comment,
     - for each script, an array script_<lowercase name>_intervals of
       uc_interval_t,
     - an array scripts[] of uc_script_t with one initializer per script
       (interval count, interval array, script name),
     - five #defines script_header_0 .. script_header_4 and a static const
       struct named u_script with the arrays level1, level2 and level3 of a
       3-level lookup table from code point to script index.
   VERSION is expected to be the Unicode version string for the
   "for Unicode %s" line of the header.
   Failure to open or to write the file produces a message on stderr.  */
4547 output_scripts (const char *version)
4549 const char *filename = "unictype/scripts.h";
4551 unsigned int ch, s, i;
4552 struct script_table t;
4553 unsigned int level1_offset, level2_offset, level3_offset;
4557 const char *lowercase_name;
4560 scriptinfo_t scriptinfo[256];
4562 stream = fopen (filename, "w");
4565 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4569 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4570 fprintf (stream, "/* Unicode scripts. */\n");
4571 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* For each script, work on a duplicate of its name in which the characters
   'A'..'Z' are singled out, and keep that copy as lowercase_name; it is used
   below to form the generated identifiers.  */
4574 for (s = 0; s < numscripts; s++)
4576 char *lcp = strdup (scripts[s]);
4579 for (cp = lcp; *cp != '\0'; cp++)
4580 if (*cp >= 'A' && *cp <= 'Z')
4583 scriptinfo[s].lowercase_name = lcp;
/* One interval array per script.  */
4586 for (s = 0; s < numscripts; s++)
4588 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
4589 scriptinfo[s].lowercase_name);
4590 fprintf (stream, "{\n");
/* Scan for runs of consecutive code points that belong to script s; the
   while loop extends a run as long as the next code point has the same
   script.  A run is written either as a single "{ ..., 1, 1 }" entry or as
   a "{ ..., 1, 0 }, { ..., 0, 1 }" pair, and runs are separated by ",\n".  */
4592 for (ch = 0; ch < 0x110000; ch++)
4593 if (unicode_scripts[ch] == s)
4599 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
4604 fprintf (stream, ",\n");
4606 fprintf (stream, " { 0x%04X, 1, 1 }", start);
4608 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
4612 fprintf (stream, "\n");
4613 fprintf (stream, "};\n");
/* The scripts[] array: for each script the number of intervals, the
   interval array and the script name as a string.  */
4616 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
4617 fprintf (stream, "{\n");
4618 for (s = 0; s < numscripts; s++)
4620 fprintf (stream, " {\n");
4621 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
4622 scriptinfo[s].lowercase_name);
4623 fprintf (stream, " script_%s_intervals,\n",
4624 scriptinfo[s].lowercase_name);
4625 fprintf (stream, " \"%s\"\n", scripts[s]);
4626 fprintf (stream, " }");
4627 if (s+1 < numscripts)
4628 fprintf (stream, ",");
4629 fprintf (stream, "\n");
4631 fprintf (stream, "};\n");
4635 script_table_init (&t);
/* Only code points with an assigned script (value other than the all-ones
   byte) are added to the table.  Note that this loop declares its own s,
   distinct from the s declared at the top of the function.  */
4637 for (ch = 0; ch < 0x110000; ch++)
4639 unsigned int s = unicode_scripts[ch];
4640 if (s != (uint8_t)~(uint8_t)0)
4641 script_table_add (&t, ch, s);
4644 script_table_finalize (&t);
/* Layout of t.result as used by the offsets below: 5 uint32_t header words,
   then t.level1_size uint32_t words of level1, then t.level2_size << t.q
   uint32_t words of level2, then the level3 bytes.  */
4646 /* Offsets in t.result, in memory of this process. */
4648 5 * sizeof (uint32_t);
4650 5 * sizeof (uint32_t)
4651 + t.level1_size * sizeof (uint32_t);
4653 5 * sizeof (uint32_t)
4654 + t.level1_size * sizeof (uint32_t)
4655 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words become the script_header_N macros.  */
4657 for (i = 0; i < 5; i++)
4658 fprintf (stream, "#define script_header_%d %d\n", i,
4659 ((uint32_t *) t.result)[i]);
4660 fprintf (stream, "static const\n");
4661 fprintf (stream, "struct\n");
4662 fprintf (stream, " {\n");
4663 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4664 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4665 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
4666 fprintf (stream, " }\n");
4667 fprintf (stream, "u_script =\n");
4668 fprintf (stream, "{\n");
/* level1 initializer, 8 entries per output line.  An entry is printed either
   as -1 or as its offset relative to level2_offset, counted in uint32_t
   units.  */
4669 fprintf (stream, " {");
4670 if (t.level1_size > 8)
4671 fprintf (stream, "\n ");
4672 for (i = 0; i < t.level1_size; i++)
4675 if (i > 0 && (i % 8) == 0)
4676 fprintf (stream, "\n ");
4677 offset = ((uint32_t *) (t.result + level1_offset))[i];
4679 fprintf (stream, " %5d", -1);
4681 fprintf (stream, " %5zu",
4682 (offset - level2_offset) / sizeof (uint32_t));
4683 if (i+1 < t.level1_size)
4684 fprintf (stream, ",");
4686 if (t.level1_size > 8)
4687 fprintf (stream, "\n ");
4688 fprintf (stream, " },\n");
/* level2 initializer, 8 entries per output line.  An entry is printed either
   as -1 or as its offset relative to level3_offset, counted in uint8_t
   units.  */
4689 fprintf (stream, " {");
4690 if (t.level2_size << t.q > 8)
4691 fprintf (stream, "\n ");
4692 for (i = 0; i < t.level2_size << t.q; i++)
4695 if (i > 0 && (i % 8) == 0)
4696 fprintf (stream, "\n ");
4697 offset = ((uint32_t *) (t.result + level2_offset))[i];
4699 fprintf (stream, " %5d", -1);
4701 fprintf (stream, " %5zu",
4702 (offset - level3_offset) / sizeof (uint8_t));
4703 if (i+1 < t.level2_size << t.q)
4704 fprintf (stream, ",");
4706 if (t.level2_size << t.q > 8)
4707 fprintf (stream, "\n ");
4708 fprintf (stream, " },\n");
/* level3 initializer: the level3 bytes in decimal, unpacked, 8 per output
   line.  */
4709 fprintf (stream, " {");
4710 if (t.level3_size << t.p > 8)
4711 fprintf (stream, "\n ");
4712 for (i = 0; i < t.level3_size << t.p; i++)
4714 if (i > 0 && (i % 8) == 0)
4715 fprintf (stream, "\n ");
4716 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
4717 if (i+1 < t.level3_size << t.p)
4718 fprintf (stream, ",");
4720 if (t.level3_size << t.p > 8)
4721 fprintf (stream, "\n ");
4722 fprintf (stream, " }\n");
4723 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error.  */
4725 if (ferror (stream) || fclose (stream))
4727 fprintf (stderr, "error writing to '%s'\n", filename);
/* Write the file "unictype/scripts_byname.gperf": a "DO NOT EDIT" header
   comment, the declaration of struct named_script, a fixed set of
   '%'-directives (hash function name scripts_hash, lookup function name
   uc_script_lookup, word array script_names, string pool
   script_stringpool), the "%%" separator, and then one "name, index" line
   per entry of scripts[].  VERSION is expected to be the Unicode version
   string for the "for Unicode %s" line of the header.
   Failure to open or to write the file produces a message on stderr.  */
4733 output_scripts_byname (const char *version)
4735 const char *filename = "unictype/scripts_byname.gperf";
4739 stream = fopen (filename, "w");
4742 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4746 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4747 fprintf (stream, "/* Unicode scripts. */\n");
4748 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* In the format strings below "%%" yields a single '%' in the output, so
   "%%struct-type" is written as "%struct-type" and "%%%%" as "%%".  */
4750 fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
4751 fprintf (stream, "%%struct-type\n");
4752 fprintf (stream, "%%language=ANSI-C\n");
4753 fprintf (stream, "%%define hash-function-name scripts_hash\n");
4754 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
4755 fprintf (stream, "%%readonly-tables\n");
4756 fprintf (stream, "%%global-table\n");
4757 fprintf (stream, "%%define word-array-name script_names\n");
4758 fprintf (stream, "%%pic\n");
4759 fprintf (stream, "%%define string-pool-name script_stringpool\n");
4760 fprintf (stream, "%%%%\n");
/* One keyword line per script: the script name and its index in scripts[].  */
4761 for (s = 0; s < numscripts; s++)
4762 fprintf (stream, "%s, %u\n", scripts[s], s);
/* A stream error or a failing fclose is reported as a write error.  */
4764 if (ferror (stream) || fclose (stream))
4766 fprintf (stderr, "error writing to '%s'\n", filename);
4771 /* ========================================================================= */
/* Block data filled by fill_blocks.  A block record consists of the two code
   point bounds parsed from a "XXXX..YYYY" range (start, end) and the block
   name.  blocks[] has room for 384 records; fill_blocks stores each new
   record at index numblocks and asserts that the records are in ascending
   order without overlap.  */
4775 typedef struct { unsigned int start; unsigned int end; const char *name; }
4777 static block_t blocks[384];
4778 static unsigned int numblocks;
/* Read the blocks file BLOCKS_FILENAME and store one record per block line in
   blocks[].  An unopenable file, an unparsable line and a read or close
   error each produce a message on stderr.  */
4781 fill_blocks (const char *blocks_filename)
4785 stream = fopen (blocks_filename, "r");
4788 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
/* Per input line: i1 and i2 receive the two bounds of the "XXXX..YYYY"
   range, padding absorbs the blanks and ';' after it, and blockname receives
   the rest of the line up to a carriage return, if there is one.  */
4795 unsigned int i1, i2;
4796 char padding[200+1];
4797 char blockname[200+1];
/* Fetch the next line into buf: at most 200 characters up to, but not
   including, the newline.  Lines that are empty or start with '#' are
   recognized by the test that follows.  */
4799 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4802 if (buf[0] == '\0' || buf[0] == '#')
4805 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
4807 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
/* Store the record; the name is duplicated because blockname is a local
   buffer.  */
4810 blocks[numblocks].start = i1;
4811 blocks[numblocks].end = i2;
4812 blocks[numblocks].name = strdup (blockname);
4813 /* It must be sorted. */
4814 assert (numblocks == 0 || blocks[numblocks-1].end < blocks[numblocks].start);
/* Capacity check against the size of blocks[].  */
4816 assert (numblocks != SIZEOF (blocks));
/* A stream error or a failing fclose is reported as a read error.  */
4819 if (ferror (stream) || fclose (stream))
4821 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
4826 /* Return the smallest block index among the blocks for characters >= ch. */
/* The search runs over the index range [0, numblocks) of blocks[] and is
   driven by the comparison of blocks[mid].end with CH.  It relies on blocks[]
   being in ascending order, which fill_blocks asserts.  */
4828 block_first_index (unsigned int ch)
4830 /* Binary search. */
4831 unsigned int lo = 0;
4832 unsigned int hi = numblocks;
4834 All blocks[i], i < lo, have blocks[i].end < ch,
4835 all blocks[i], i >= hi, have blocks[i].end >= ch. */
4838 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4839 if (blocks[mid].end < ch)
/* block_last_index: the search runs over the index range [0, numblocks) of
   blocks[] and is driven by the comparison of blocks[mid].start with CH.  It
   relies on blocks[] being in ascending order, which fill_blocks asserts.  */
4847 /* Return the largest block index among the blocks for characters <= ch,
4850 block_last_index (unsigned int ch)
4852 /* Binary search. */
4853 unsigned int lo = 0;
4854 unsigned int hi = numblocks;
4856 All blocks[i], i < lo, have blocks[i].start <= ch,
4857 all blocks[i], i >= hi, have blocks[i].start > ch. */
4860 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4861 if (blocks[mid].start <= ch)
/* Write the file "unictype/blocks.h".  The generated text consists of
     - a "DO NOT EDIT" header comment,
     - the array blocks[] of uc_block_t with one { start, end, "name" }
       initializer per record of the in-memory blocks[] array,
     - the #defines blocks_level1_shift and blocks_level1_threshold,
     - the array blocks_level1[] with a pair (first block index, last block
       index) for each chunk of 1 << shift code points below threshold,
     - the #defines blocks_upper_first_index and blocks_upper_last_index.
   VERSION is expected to be the Unicode version string for the
   "for Unicode %s" line of the header.
   Failure to open or to write the file produces a message on stderr.  */
4870 output_blocks (const char *version)
4872 const char *filename = "unictype/blocks.h";
4873 const unsigned int shift = 8; /* bits to shift away for array access */
4874 const unsigned int threshold = 0x28000; /* cut-off table here to save space */
4879 stream = fopen (filename, "w");
4882 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4886 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4887 fprintf (stream, "/* Unicode blocks. */\n");
4888 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4891 fprintf (stream, "static const uc_block_t blocks[] =\n");
4892 fprintf (stream, "{\n");
4893 for (i = 0; i < numblocks; i++)
4895 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
4896 blocks[i].end, blocks[i].name);
4897 if (i+1 < numblocks)
4898 fprintf (stream, ",");
4899 fprintf (stream, "\n");
4901 fprintf (stream, "};\n");
4902 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
4903 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
4904 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
4905 threshold >> shift);
4906 fprintf (stream, "{\n");
/* For chunk i1, covering the code points i1 << shift through
   ((i1 + 1) << shift) - 1, emit block_first_index of the chunk's first code
   point and block_last_index of its last code point.  */
4907 for (i1 = 0; i1 < (threshold >> shift); i1++)
4909 unsigned int first_index = block_first_index (i1 << shift);
4910 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
4911 fprintf (stream, " %3d, %3d", first_index, last_index);
4912 if (i1+1 < (threshold >> shift))
4913 fprintf (stream, ",");
4914 fprintf (stream, "\n");
4916 fprintf (stream, "};\n");
/* The same kind of index pair for the code points from threshold through
   0x10FFFF, which blocks_level1[] does not cover.  */
4917 fprintf (stream, "#define blocks_upper_first_index %d\n",
4918 block_first_index (threshold));
4919 fprintf (stream, "#define blocks_upper_last_index %d\n",
4920 block_last_index (0x10FFFF));
/* A stream error or a failing fclose is reported as a write error.  */
4922 if (ferror (stream) || fclose (stream))
4924 fprintf (stderr, "error writing to '%s'\n", filename);
4929 /* ========================================================================= */
4931 /* C and Java syntax. */
/* Categories of a character with respect to identifier syntax.
   c_ident_category, for example, returns UC_IDENTIFIER_VALID for the digits
   '0'..'9' and UC_IDENTIFIER_START for the ASCII letters and '_'.  */
4935 UC_IDENTIFIER_START, /* valid as first or subsequent character */
4936 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
4937 UC_IDENTIFIER_INVALID, /* not valid */
4938 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
4941 /* ISO C 99 section 6.4.(3). */
/* Return whether CH is one of the six characters tested below: space,
   horizontal tab, '\n', '\r', vertical tab or form-feed.  */
4943 is_c_whitespace (unsigned int ch)
4945 return (ch == ' ' /* space */
4946 || ch == '\t' /* horizontal tab */
4947 || ch == '\n' || ch == '\r' /* new-line */
4948 || ch == '\v' /* vertical tab */
4949 || ch == '\f'); /* form-feed */
4952 /* ISO C 99 section 6.4.2.1 and appendix D. */
4954 c_ident_category (unsigned int ch)
4956 /* Section 6.4.2.1. */
4957 if (ch >= '0' && ch <= '9')
4958 return UC_IDENTIFIER_VALID;
4959 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4960 return UC_IDENTIFIER_START;
4966 || (ch >= 0x00C0 && ch <= 0x00D6)
4967 || (ch >= 0x00D8 && ch <= 0x00F6)
4968 || (ch >= 0x00F8 && ch <= 0x01F5)
4969 || (ch >= 0x01FA && ch <= 0x0217)
4970 || (ch >= 0x0250 && ch <= 0x02A8)
4971 || (ch >= 0x1E00 && ch <= 0x1E9B)
4972 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4976 || (ch >= 0x0388 && ch <= 0x038A)
4978 || (ch >= 0x038E && ch <= 0x03A1)
4979 || (ch >= 0x03A3 && ch <= 0x03CE)
4980 || (ch >= 0x03D0 && ch <= 0x03D6)
4985 || (ch >= 0x03E2 && ch <= 0x03F3)
4986 || (ch >= 0x1F00 && ch <= 0x1F15)
4987 || (ch >= 0x1F18 && ch <= 0x1F1D)
4988 || (ch >= 0x1F20 && ch <= 0x1F45)
4989 || (ch >= 0x1F48 && ch <= 0x1F4D)
4990 || (ch >= 0x1F50 && ch <= 0x1F57)
4994 || (ch >= 0x1F5F && ch <= 0x1F7D)
4995 || (ch >= 0x1F80 && ch <= 0x1FB4)
4996 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4997 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4998 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4999 || (ch >= 0x1FD0 && ch <= 0x1FD3)
5000 || (ch >= 0x1FD6 && ch <= 0x1FDB)
5001 || (ch >= 0x1FE0 && ch <= 0x1FEC)
5002 || (ch >= 0x1FF2 && ch <= 0x1FF4)
5003 || (ch >= 0x1FF6 && ch <= 0x1FFC)
5005 || (ch >= 0x0401 && ch <= 0x040C)
5006 || (ch >= 0x040E && ch <= 0x044F)
5007 || (ch >= 0x0451 && ch <= 0x045C)
5008 || (ch >= 0x045E && ch <= 0x0481)
5009 || (ch >= 0x0490 && ch <= 0x04C4)
5010 || (ch >= 0x04C7 && ch <= 0x04C8)
5011 || (ch >= 0x04CB && ch <= 0x04CC)
5012 || (ch >= 0x04D0 && ch <= 0x04EB)
5013 || (ch >= 0x04EE && ch <= 0x04F5)
5014 || (ch >= 0x04F8 && ch <= 0x04F9)
5016 || (ch >= 0x0531 && ch <= 0x0556)
5017 || (ch >= 0x0561 && ch <= 0x0587)
5019 || (ch >= 0x05B0 && ch <= 0x05B9)
5020 || (ch >= 0x05BB && ch <= 0x05BD)
5022 || (ch >= 0x05C1 && ch <= 0x05C2)
5023 || (ch >= 0x05D0 && ch <= 0x05EA)
5024 || (ch >= 0x05F0 && ch <= 0x05F2)
5026 || (ch >= 0x0621 && ch <= 0x063A)
5027 || (ch >= 0x0640 && ch <= 0x0652)
5028 || (ch >= 0x0670 && ch <= 0x06B7)
5029 || (ch >= 0x06BA && ch <= 0x06BE)
5030 || (ch >= 0x06C0 && ch <= 0x06CE)
5031 || (ch >= 0x06D0 && ch <= 0x06DC)
5032 || (ch >= 0x06E5 && ch <= 0x06E8)
5033 || (ch >= 0x06EA && ch <= 0x06ED)
5035 || (ch >= 0x0901 && ch <= 0x0903)
5036 || (ch >= 0x0905 && ch <= 0x0939)
5037 || (ch >= 0x093E && ch <= 0x094D)
5038 || (ch >= 0x0950 && ch <= 0x0952)
5039 || (ch >= 0x0958 && ch <= 0x0963)
5041 || (ch >= 0x0981 && ch <= 0x0983)
5042 || (ch >= 0x0985 && ch <= 0x098C)
5043 || (ch >= 0x098F && ch <= 0x0990)
5044 || (ch >= 0x0993 && ch <= 0x09A8)
5045 || (ch >= 0x09AA && ch <= 0x09B0)
5047 || (ch >= 0x09B6 && ch <= 0x09B9)
5048 || (ch >= 0x09BE && ch <= 0x09C4)
5049 || (ch >= 0x09C7 && ch <= 0x09C8)
5050 || (ch >= 0x09CB && ch <= 0x09CD)
5051 || (ch >= 0x09DC && ch <= 0x09DD)
5052 || (ch >= 0x09DF && ch <= 0x09E3)
5053 || (ch >= 0x09F0 && ch <= 0x09F1)
5056 || (ch >= 0x0A05 && ch <= 0x0A0A)
5057 || (ch >= 0x0A0F && ch <= 0x0A10)
5058 || (ch >= 0x0A13 && ch <= 0x0A28)
5059 || (ch >= 0x0A2A && ch <= 0x0A30)
5060 || (ch >= 0x0A32 && ch <= 0x0A33)
5061 || (ch >= 0x0A35 && ch <= 0x0A36)
5062 || (ch >= 0x0A38 && ch <= 0x0A39)
5063 || (ch >= 0x0A3E && ch <= 0x0A42)
5064 || (ch >= 0x0A47 && ch <= 0x0A48)
5065 || (ch >= 0x0A4B && ch <= 0x0A4D)
5066 || (ch >= 0x0A59 && ch <= 0x0A5C)
5070 || (ch >= 0x0A81 && ch <= 0x0A83)
5071 || (ch >= 0x0A85 && ch <= 0x0A8B)
5073 || (ch >= 0x0A8F && ch <= 0x0A91)
5074 || (ch >= 0x0A93 && ch <= 0x0AA8)
5075 || (ch >= 0x0AAA && ch <= 0x0AB0)
5076 || (ch >= 0x0AB2 && ch <= 0x0AB3)
5077 || (ch >= 0x0AB5 && ch <= 0x0AB9)
5078 || (ch >= 0x0ABD && ch <= 0x0AC5)
5079 || (ch >= 0x0AC7 && ch <= 0x0AC9)
5080 || (ch >= 0x0ACB && ch <= 0x0ACD)
5084 || (ch >= 0x0B01 && ch <= 0x0B03)
5085 || (ch >= 0x0B05 && ch <= 0x0B0C)
5086 || (ch >= 0x0B0F && ch <= 0x0B10)
5087 || (ch >= 0x0B13 && ch <= 0x0B28)
5088 || (ch >= 0x0B2A && ch <= 0x0B30)
5089 || (ch >= 0x0B32 && ch <= 0x0B33)
5090 || (ch >= 0x0B36 && ch <= 0x0B39)
5091 || (ch >= 0x0B3E && ch <= 0x0B43)
5092 || (ch >= 0x0B47 && ch <= 0x0B48)
5093 || (ch >= 0x0B4B && ch <= 0x0B4D)
5094 || (ch >= 0x0B5C && ch <= 0x0B5D)
5095 || (ch >= 0x0B5F && ch <= 0x0B61)
5097 || (ch >= 0x0B82 && ch <= 0x0B83)
5098 || (ch >= 0x0B85 && ch <= 0x0B8A)
5099 || (ch >= 0x0B8E && ch <= 0x0B90)
5100 || (ch >= 0x0B92 && ch <= 0x0B95)
5101 || (ch >= 0x0B99 && ch <= 0x0B9A)
5103 || (ch >= 0x0B9E && ch <= 0x0B9F)
5104 || (ch >= 0x0BA3 && ch <= 0x0BA4)
5105 || (ch >= 0x0BA8 && ch <= 0x0BAA)
5106 || (ch >= 0x0BAE && ch <= 0x0BB5)
5107 || (ch >= 0x0BB7 && ch <= 0x0BB9)
5108 || (ch >= 0x0BBE && ch <= 0x0BC2)
5109 || (ch >= 0x0BC6 && ch <= 0x0BC8)
5110 || (ch >= 0x0BCA && ch <= 0x0BCD)
5112 || (ch >= 0x0C01 && ch <= 0x0C03)
5113 || (ch >= 0x0C05 && ch <= 0x0C0C)
5114 || (ch >= 0x0C0E && ch <= 0x0C10)
5115 || (ch >= 0x0C12 && ch <= 0x0C28)
5116 || (ch >= 0x0C2A && ch <= 0x0C33)
5117 || (ch >= 0x0C35 && ch <= 0x0C39)
5118 || (ch >= 0x0C3E && ch <= 0x0C44)
5119 || (ch >= 0x0C46 && ch <= 0x0C48)
5120 || (ch >= 0x0C4A && ch <= 0x0C4D)
5121 || (ch >= 0x0C60 && ch <= 0x0C61)
5123 || (ch >= 0x0C82 && ch <= 0x0C83)
5124 || (ch >= 0x0C85 && ch <= 0x0C8C)
5125 || (ch >= 0x0C8E && ch <= 0x0C90)
5126 || (ch >= 0x0C92 && ch <= 0x0CA8)
5127 || (ch >= 0x0CAA && ch <= 0x0CB3)
5128 || (ch >= 0x0CB5 && ch <= 0x0CB9)
5129 || (ch >= 0x0CBE && ch <= 0x0CC4)
5130 || (ch >= 0x0CC6 && ch <= 0x0CC8)
5131 || (ch >= 0x0CCA && ch <= 0x0CCD)
5133 || (ch >= 0x0CE0 && ch <= 0x0CE1)
5135 || (ch >= 0x0D02 && ch <= 0x0D03)
5136 || (ch >= 0x0D05 && ch <= 0x0D0C)
5137 || (ch >= 0x0D0E && ch <= 0x0D10)
5138 || (ch >= 0x0D12 && ch <= 0x0D28)
5139 || (ch >= 0x0D2A && ch <= 0x0D39)
5140 || (ch >= 0x0D3E && ch <= 0x0D43)
5141 || (ch >= 0x0D46 && ch <= 0x0D48)
5142 || (ch >= 0x0D4A && ch <= 0x0D4D)
5143 || (ch >= 0x0D60 && ch <= 0x0D61)
5145 || (ch >= 0x0E01 && ch <= 0x0E3A)
5146 || (ch >= 0x0E40 && ch <= 0x0E5B)
5148 || (ch >= 0x0E81 && ch <= 0x0E82)
5150 || (ch >= 0x0E87 && ch <= 0x0E88)
5153 || (ch >= 0x0E94 && ch <= 0x0E97)
5154 || (ch >= 0x0E99 && ch <= 0x0E9F)
5155 || (ch >= 0x0EA1 && ch <= 0x0EA3)
5158 || (ch >= 0x0EAA && ch <= 0x0EAB)
5159 || (ch >= 0x0EAD && ch <= 0x0EAE)
5160 || (ch >= 0x0EB0 && ch <= 0x0EB9)
5161 || (ch >= 0x0EBB && ch <= 0x0EBD)
5162 || (ch >= 0x0EC0 && ch <= 0x0EC4)
5164 || (ch >= 0x0EC8 && ch <= 0x0ECD)
5165 || (ch >= 0x0EDC && ch <= 0x0EDD)
5168 || (ch >= 0x0F18 && ch <= 0x0F19)
5172 || (ch >= 0x0F3E && ch <= 0x0F47)
5173 || (ch >= 0x0F49 && ch <= 0x0F69)
5174 || (ch >= 0x0F71 && ch <= 0x0F84)
5175 || (ch >= 0x0F86 && ch <= 0x0F8B)
5176 || (ch >= 0x0F90 && ch <= 0x0F95)
5178 || (ch >= 0x0F99 && ch <= 0x0FAD)
5179 || (ch >= 0x0FB1 && ch <= 0x0FB7)
5182 || (ch >= 0x10A0 && ch <= 0x10C5)
5183 || (ch >= 0x10D0 && ch <= 0x10F6)
5185 || (ch >= 0x3041 && ch <= 0x3093)
5186 || (ch >= 0x309B && ch <= 0x309C)
5188 || (ch >= 0x30A1 && ch <= 0x30F6)
5189 || (ch >= 0x30FB && ch <= 0x30FC)
5191 || (ch >= 0x3105 && ch <= 0x312C)
5192 /* CJK Unified Ideographs */
5193 || (ch >= 0x4E00 && ch <= 0x9FA5)
5195 || (ch >= 0xAC00 && ch <= 0xD7A3)
5197 || (ch >= 0x0660 && ch <= 0x0669)
5198 || (ch >= 0x06F0 && ch <= 0x06F9)
5199 || (ch >= 0x0966 && ch <= 0x096F)
5200 || (ch >= 0x09E6 && ch <= 0x09EF)
5201 || (ch >= 0x0A66 && ch <= 0x0A6F)
5202 || (ch >= 0x0AE6 && ch <= 0x0AEF)
5203 || (ch >= 0x0B66 && ch <= 0x0B6F)
5204 || (ch >= 0x0BE7 && ch <= 0x0BEF)
5205 || (ch >= 0x0C66 && ch <= 0x0C6F)
5206 || (ch >= 0x0CE6 && ch <= 0x0CEF)
5207 || (ch >= 0x0D66 && ch <= 0x0D6F)
5208 || (ch >= 0x0E50 && ch <= 0x0E59)
5209 || (ch >= 0x0ED0 && ch <= 0x0ED9)
5210 || (ch >= 0x0F20 && ch <= 0x0F33)
5211 /* Special characters */
5214 || (ch >= 0x02B0 && ch <= 0x02B8)
5216 || (ch >= 0x02BD && ch <= 0x02C1)
5217 || (ch >= 0x02D0 && ch <= 0x02D1)
5218 || (ch >= 0x02E0 && ch <= 0x02E4)
5224 || (ch >= 0x203F && ch <= 0x2040)
5227 || (ch >= 0x210A && ch <= 0x2113)
5229 || (ch >= 0x2118 && ch <= 0x211D)
5233 || (ch >= 0x212A && ch <= 0x2131)
5234 || (ch >= 0x2133 && ch <= 0x2138)
5235 || (ch >= 0x2160 && ch <= 0x2182)
5236 || (ch >= 0x3005 && ch <= 0x3007)
5237 || (ch >= 0x3021 && ch <= 0x3029)
5239 return UC_IDENTIFIER_START;
5240 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710

   Return true if CH is one of the five white space characters tested
   below: space, horizontal tab, form feed, line feed, carriage return.  */
static bool
is_java_whitespace (unsigned int ch)
{
  return (ch == ' ' || ch == '\t' || ch == '\f'
          || ch == '\n' || ch == '\r');
}
5252 /* The Java Language Specification, 3rd edition, §3.8.
5253 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
5254 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
5256 java_ident_category (unsigned int ch)
5258 /* FIXME: Check this against Sun's JDK implementation. */
5259 if (is_category_L (ch) /* = Character.isLetter(ch) */
5260 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
5261 || is_category_Sc (ch) /* currency symbol */
5262 || is_category_Pc (ch) /* connector punctuation */
5264 return UC_IDENTIFIER_START;
5265 if (is_category_Nd (ch) /* digit */
5266 || is_category_Mc (ch) /* combining mark */
5267 || is_category_Mn (ch) /* non-spacing mark */
5269 return UC_IDENTIFIER_VALID;
5270 if ((ch >= 0x0000 && ch <= 0x0008)
5271 || (ch >= 0x000E && ch <= 0x001B)
5272 || (ch >= 0x007F && ch <= 0x009F)
5273 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
5275 return UC_IDENTIFIER_IGNORABLE;
5276 return UC_IDENTIFIER_INVALID;
5279 /* Construction of sparse 3-level tables. */
/* TABLE supplies the name used further down for the table type and its
   functions (struct identsyntax_table, identsyntax_table_init,
   identsyntax_table_add, identsyntax_table_finalize).  ELEMENT and DEFAULT
   are presumably the stored value type and the value assumed for code
   points that were never added -- confirm against the table template that
   consumes these macros.  xmalloc and xrealloc are mapped onto plain
   malloc and realloc.  */
5280 #define TABLE identsyntax_table
5281 #define ELEMENT uint8_t
5282 #define DEFAULT UC_IDENTIFIER_INVALID
5283 #define xmalloc malloc
5284 #define xrealloc realloc
5287 /* Output an identifier syntax categorization in a three-level bitmap. */
/* FILENAME is opened for writing.  PREDICATE is evaluated for every code
   point below 0x110000; its result must not exceed 3, and code points for
   which it returns UC_IDENTIFIER_INVALID are not added to the table.  NAME
   is printed as the identifier of the emitted struct.  VERSION is presumably
   the value substituted into the "for Unicode %s" header line -- confirm.
   The emitted file holds five identsyntax_header_N defines followed by one
   struct with the level1, level2 and level3 arrays.  */
5289 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
5293 struct identsyntax_table t;
5294 unsigned int level1_offset, level2_offset, level3_offset;
5296 stream = fopen (filename, "w");
5299 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5303 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5304 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
5305 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5310 identsyntax_table_init (&t);
5312 for (ch = 0; ch < 0x110000; ch++)
5314 int syntaxcode = predicate (ch);
/* The level3 packing below gives each entry two bits, hence the limit.  */
5316 assert (syntaxcode <= 0x03);
5318 if (syntaxcode != UC_IDENTIFIER_INVALID)
5319 identsyntax_table_add (&t, ch, syntaxcode);
5322 identsyntax_table_finalize (&t);
5324 /* Offsets in t.result, in memory of this process. */
/* t.result starts with five uint32_t header words, followed by the level1
   entries, then the level2 entries (both uint32_t), then level3.  */
5326 5 * sizeof (uint32_t);
5328 5 * sizeof (uint32_t)
5329 + t.level1_size * sizeof (uint32_t);
5331 5 * sizeof (uint32_t)
5332 + t.level1_size * sizeof (uint32_t)
5333 + (t.level2_size << t.q) * sizeof (uint32_t);
5335 for (i = 0; i < 5; i++)
5336 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
5337 ((uint32_t *) t.result)[i]);
5338 fprintf (stream, "static const\n");
5339 fprintf (stream, "struct\n");
5340 fprintf (stream, " {\n");
5341 fprintf (stream, " int level1[%zu];\n", t.level1_size);
5342 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
5343 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
5344 (1 << t.p) * 2 / 16);
5345 fprintf (stream, " }\n");
5346 fprintf (stream, "%s =\n", name);
5347 fprintf (stream, "{\n");
5348 fprintf (stream, " {");
5349 if (t.level1_size > 8)
5350 fprintf (stream, "\n ");
/* level1: eight entries per output line; a stored offset is printed as an
   index relative to the start of level2, in uint32_t units.  */
5351 for (i = 0; i < t.level1_size; i++)
5354 if (i > 0 && (i % 8) == 0)
5355 fprintf (stream, "\n ");
5356 offset = ((uint32_t *) (t.result + level1_offset))[i];
5358 fprintf (stream, " %5d", -1);
5360 fprintf (stream, " %5zu",
5361 (offset - level2_offset) / sizeof (uint32_t));
5362 if (i+1 < t.level1_size)
5363 fprintf (stream, ",");
5365 if (t.level1_size > 8)
5366 fprintf (stream, "\n ");
5367 fprintf (stream, " },\n");
5368 fprintf (stream, " {");
5369 if (t.level2_size << t.q > 8)
5370 fprintf (stream, "\n ");
/* level2: same layout; offsets are printed relative to the start of level3,
   in uint8_t units.  */
5371 for (i = 0; i < t.level2_size << t.q; i++)
5374 if (i > 0 && (i % 8) == 0)
5375 fprintf (stream, "\n ");
5376 offset = ((uint32_t *) (t.result + level2_offset))[i];
5378 fprintf (stream, " %5d", -1);
5380 fprintf (stream, " %5zu",
5381 (offset - level3_offset) / sizeof (uint8_t));
5382 if (i+1 < t.level2_size << t.q)
5383 fprintf (stream, ",");
5385 if (t.level2_size << t.q > 8)
5386 fprintf (stream, "\n ");
5387 fprintf (stream, " },\n");
5388 /* Pack the level3 array. Each entry needs 2 bits only. */
/* Each printed 16-bit word combines eight consecutive level3 bytes: entry
   8*i lands in bits 0-1 and entry 8*i+7 in bits 14-15.  */
5389 fprintf (stream, " {");
5390 if ((t.level3_size << t.p) * 2 / 16 > 8)
5391 fprintf (stream, "\n ");
5392 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
5394 if (i > 0 && (i % 8) == 0)
5395 fprintf (stream, "\n ");
5396 fprintf (stream, " 0x%04x",
5397 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
5398 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
5399 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
5400 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
5401 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
5402 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
5403 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
5404 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
5405 if (i+1 < (t.level3_size << t.p) * 2 / 16)
5406 fprintf (stream, ",");
5408 if ((t.level3_size << t.p) * 2 / 16 > 8)
5409 fprintf (stream, "\n ");
5410 fprintf (stream, " }\n");
5411 fprintf (stream, "};\n");
/* fclose is only attempted when no stream error has been recorded.  */
5413 if (ferror (stream) || fclose (stream))
5415 fprintf (stderr, "error writing to '%s'\n", filename);
5421 output_ident_properties (const char *version)
5423 #define PROPERTY(P) \
5424 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
5425 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5426 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
5427 PROPERTY(c_whitespace)
5428 PROPERTY(java_whitespace)
5431 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
5432 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
5435 /* ========================================================================= */
5437 /* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
5438 glibc/localedata/locales/i18n file, generated by
5439 glibc/localedata/gen-unicode-ctype.c. */
5441 /* Character mappings. */
5444 to_upper (unsigned int ch)
5446 if (unicode_attributes[ch].name != NULL
5447 && unicode_attributes[ch].upper != NONE)
5448 return unicode_attributes[ch].upper;
5454 to_lower (unsigned int ch)
5456 if (unicode_attributes[ch].name != NULL
5457 && unicode_attributes[ch].lower != NONE)
5458 return unicode_attributes[ch].lower;
5464 to_title (unsigned int ch)
5466 if (unicode_attributes[ch].name != NULL
5467 && unicode_attributes[ch].title != NONE)
5468 return unicode_attributes[ch].title;
5473 /* Character class properties. */
/* Return true if CH is uppercase, defined here as: to_lower maps CH to a
   different character.  */
static bool
is_upper (unsigned int ch)
{
  return (to_lower (ch) != ch);
}
/* Return true if CH is lowercase, defined here as: to_upper maps CH to a
   different character, or CH is U+00DF.  */
static bool
is_lower (unsigned int ch)
{
  return (to_upper (ch) != ch)
         /* <U00DF> is lowercase, but without simple to_upper mapping.  */
         || (ch == 0x00DF);
}
5490 is_alpha (unsigned int ch)
5492 return (unicode_attributes[ch].name != NULL
5493 && ((unicode_attributes[ch].category[0] == 'L'
5494 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5495 <U0E2F>, <U0E46> should belong to is_punct. */
5496 && (ch != 0x0E2F) && (ch != 0x0E46))
5497 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5498 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
5500 || (ch >= 0x0E34 && ch <= 0x0E3A)
5501 || (ch >= 0x0E47 && ch <= 0x0E4E)
5502 /* Avoid warning for <U0345>. */
5504 /* Avoid warnings for <U2160>..<U217F>. */
5505 || (unicode_attributes[ch].category[0] == 'N'
5506 && unicode_attributes[ch].category[1] == 'l')
5507 /* Avoid warnings for <U24B6>..<U24E9>. */
5508 || (unicode_attributes[ch].category[0] == 'S'
5509 && unicode_attributes[ch].category[1] == 'o'
5510 && strstr (unicode_attributes[ch].name, " LETTER ")
5512 /* Consider all the non-ASCII digits as alphabetic.
5513 ISO C 99 forbids us to have them in category "digit",
5514 but we want iswalnum to return true on them. */
5515 || (unicode_attributes[ch].category[0] == 'N'
5516 && unicode_attributes[ch].category[1] == 'd'
5517 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH is a digit in the ISO C sense, i.e. one of the ASCII
   characters '0'..'9'.  The disabled variant would accept every character
   of general category Nd.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     7.25.2.1.5:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* Return true if CH is alphabetic (is_alpha) or a digit (is_digit).  */
static bool
is_alnum (unsigned int ch)
{
  return is_alpha (ch) || is_digit (ch);
}
5549 is_blank (unsigned int ch)
5551 return (ch == 0x0009 /* '\t' */
5552 /* Category Zs without mention of "<noBreak>" */
5553 || (unicode_attributes[ch].name != NULL
5554 && unicode_attributes[ch].category[0] == 'Z'
5555 && unicode_attributes[ch].category[1] == 's'
5556 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
5560 is_space (unsigned int ch)
5562 /* Don't make U+00A0 a space. Non-breaking space means that all programs
5563 should treat it like a punctuation character, not like a space. */
5564 return (ch == 0x0020 /* ' ' */
5565 || ch == 0x000C /* '\f' */
5566 || ch == 0x000A /* '\n' */
5567 || ch == 0x000D /* '\r' */
5568 || ch == 0x0009 /* '\t' */
5569 || ch == 0x000B /* '\v' */
5570 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
5571 || (unicode_attributes[ch].name != NULL
5572 && unicode_attributes[ch].category[0] == 'Z'
5573 && (unicode_attributes[ch].category[1] == 'l'
5574 || unicode_attributes[ch].category[1] == 'p'
5575 || (unicode_attributes[ch].category[1] == 's'
5576 && !strstr (unicode_attributes[ch].decomposition,
5581 is_cntrl (unsigned int ch)
5583 return (unicode_attributes[ch].name != NULL
5584 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
5585 /* Categories Zl and Zp */
5586 || (unicode_attributes[ch].category[0] == 'Z'
5587 && (unicode_attributes[ch].category[1] == 'l'
5588 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH is a hexadecimal digit in the ISO C sense, i.e. one of
   the ASCII characters 0..9, A..F, a..f.  The disabled variant would build
   on is_digit instead of the ASCII digit range.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
5614 is_graph (unsigned int ch)
5616 return (unicode_attributes[ch].name != NULL
5617 && strcmp (unicode_attributes[ch].name, "<control>")
5622 is_print (unsigned int ch)
5624 return (unicode_attributes[ch].name != NULL
5625 && strcmp (unicode_attributes[ch].name, "<control>")
5626 /* Categories Zl and Zp */
5627 && !(unicode_attributes[ch].name != NULL
5628 && unicode_attributes[ch].category[0] == 'Z'
5629 && (unicode_attributes[ch].category[1] == 'l'
5630 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH is punctuation in the traditional POSIX sense: graphic
   (is_graph) but neither alphabetic (is_alpha) nor a digit (is_digit).
   The disabled variant would accept exactly the general categories P*.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
5646 /* Output all properties. */
/* The PROPERTY(P) macro defined below handles one predicate is_P: it writes
   a debug listing to unictype/ctype_P.txt, a test source to
   ../tests/unictype/test-ctype_P.c (testing the expression "uc_is_P (c)"),
   and the table header unictype/ctype_P.h under the name u_is_P, handing
   VERSION on to output_predicate.  */
5648 output_old_ctype (const char *version)
5650 #define PROPERTY(P) \
5651 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
5652 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5653 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
5672 is_combining (unsigned int ch)
5674 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
5675 file. In 3.0.1 it was identical to the union of the general categories
5676 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
5677 PropList.txt file, so we take the latter definition. */
5678 return (unicode_attributes[ch].name != NULL
5679 && unicode_attributes[ch].category[0] == 'M'
5680 && (unicode_attributes[ch].category[1] == 'n'
5681 || unicode_attributes[ch].category[1] == 'c'
5682 || unicode_attributes[ch].category[1] == 'e'));
5686 is_combining_level3 (unsigned int ch)
5688 return is_combining (ch)
5689 && !(unicode_attributes[ch].combining[0] != '\0'
5690 && unicode_attributes[ch].combining[0] != '0'
5691 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<UXXXX>" with
   four hexadecimal digits for I below 0x10000, "<UXXXXXXXX>" with eight
   digits otherwise.
   The result lives in a static buffer that is overwritten by the next
   call, so the caller must copy it before calling again.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* "<U" + 8 hexadecimal digits + ">" = 11 characters, plus the NUL.  */
  static char buf[11+1];

  /* Bounded by the buffer size so the write can never run past buf.  */
  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval:
   the symbol of LOW, "..", and the symbol of HIGH.
   The result lives in a static buffer that is overwritten by the next
   call.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters each, plus "..", plus the NUL.  */
  static char buf[24+1];

  /* ucs_symbol returns the same static buffer on every call, so the first
     symbol has to be copied out before the second one is requested.  */
  strcpy (buf, ucs_symbol (low));
  strcat (buf, "..");
  strcat (buf, ucs_symbol (high));
  return buf;
}
5716 /* Output a character class (= property) table. */
/* Evaluates FUNC once for every code point below 0x110000, then prints
   CLASSNAME to STREAM followed by the matching code points.  Each maximal
   run of consecutive matches becomes one entry, formatted with ucs_symbol
   or ucs_symbol_range.  Entries are separated by ";", and a "/" plus
   newline is emitted before an entry that would pass MAX_COLUMN.
   NOTE(review): TABLE is a 0x110000-byte automatic array (about 1.1 MB);
   confirm that the stack is large enough wherever this runs.  */
5719 output_charclass (FILE *stream, const char *classname,
5720 bool (*func) (unsigned int))
5722 char table[0x110000];
5724 bool need_semicolon;
5725 const int max_column = 75;
5728 for (i = 0; i < 0x110000; i++)
5729 table[i] = (int) func (i);
5731 fprintf (stream, "%s ", classname);
5732 need_semicolon = false;
/* The loop has no increment clause; I is advanced inside the body.  */
5734 for (i = 0; i < 0x110000; )
5740 unsigned int low, high;
5746 while (i < 0x110000 && table[i]);
5750 strcpy (buf, ucs_symbol (low));
5752 strcpy (buf, ucs_symbol_range (low, high));
5756 fprintf (stream, ";");
5760 if (column + strlen (buf) > max_column)
5762 fprintf (stream, "/\n ");
5766 fprintf (stream, "%s", buf);
5767 column += strlen (buf);
5768 need_semicolon = true;
5771 fprintf (stream, "\n");
5774 /* Output a character mapping table. */
/* Evaluates FUNC for every code point below 0x110000 and records which
   ones are mapped to a different value.  Prints MAPNAME to STREAM, followed
   by entries built from ucs_symbol of a code point and ucs_symbol of its
   image under FUNC.  Entries are separated by ";", and a "/" plus newline
   is emitted before an entry that would pass MAX_COLUMN.
   NOTE(review): TABLE is a 0x110000-byte automatic array (about 1.1 MB);
   confirm that the stack is large enough wherever this runs.  */
5777 output_charmap (FILE *stream, const char *mapname,
5778 unsigned int (*func) (unsigned int))
5780 char table[0x110000];
5782 bool need_semicolon;
5783 const int max_column = 75;
5786 for (i = 0; i < 0x110000; i++)
5787 table[i] = (func (i) != i);
5789 fprintf (stream, "%s ", mapname);
5790 need_semicolon = false;
5792 for (i = 0; i < 0x110000; i++)
/* FUNC is called a second time here; only the boolean "differs from I" was
   kept in TABLE.  */
5798 strcat (buf, ucs_symbol (i));
5800 strcat (buf, ucs_symbol (func (i)));
5805 fprintf (stream, ";");
5809 if (column + strlen (buf) > max_column)
5811 fprintf (stream, "/\n ");
5815 fprintf (stream, "%s", buf);
5816 column += strlen (buf);
5817 need_semicolon = true;
5819 fprintf (stream, "\n");
5822 /* Output the width table. */
/* Called by output_tables with its output stream, after the character maps
   and before the "END LC_CTYPE" line.  */
5825 output_widthmap (FILE *stream)
5829 /* Output the tables to the given file. */
/* Writes FILENAME in three steps:
     1. an LC_IDENTIFICATION section carrying VERSION as title and revision
        and a date formatted as %Y-%m-%d from gmtime;
     2. a consistency pass over every code point below 0x110000 that checks
        the is_* predicates and to_* mappings against the quoted
        restrictions; the visible checks report violations on stderr;
     3. an LC_CTYPE section with one line per character class and per
        character map, produced by output_charclass and output_charmap.  */
5832 output_tables (const char *filename, const char *version)
5837 stream = fopen (filename, "w");
5840 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5844 fprintf (stream, "escape_char /\n");
5845 fprintf (stream, "comment_char %%\n");
5846 fprintf (stream, "\n");
5847 fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
5849 fprintf (stream, "\n");
5851 fprintf (stream, "LC_IDENTIFICATION\n");
5852 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
5853 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
5854 fprintf (stream, "address \"\"\n");
5855 fprintf (stream, "contact \"\"\n");
5856 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
5857 fprintf (stream, "tel \"\"\n");
5858 fprintf (stream, "fax \"\"\n");
5859 fprintf (stream, "language \"\"\n");
5860 fprintf (stream, "territory \"Earth\"\n");
5861 fprintf (stream, "revision \"%s\"\n", version);
5866 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
5867 fprintf (stream, "date \"%s\"\n", date);
5869 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
5870 fprintf (stream, "END LC_IDENTIFICATION\n");
5871 fprintf (stream, "\n");
/* Consistency pass: nothing is written to STREAM in this loop.  */
5874 for (ch = 0; ch < 0x110000; ch++)
5876 /* toupper restriction: "Only characters specified for the keywords
5877 lower and upper shall be specified. */
5878 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5880 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
5881 ucs_symbol (ch), ch, to_upper (ch));
5883 /* tolower restriction: "Only characters specified for the keywords
5884 lower and upper shall be specified. */
5885 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5887 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
5888 ucs_symbol (ch), ch, to_lower (ch));
5890 /* alpha restriction: "Characters classified as either upper or lower
5891 shall automatically belong to this class. */
5892 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
5893 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
5895 /* alpha restriction: "No character specified for the keywords cntrl,
5896 digit, punct or space shall be specified." */
5897 if (is_alpha (ch) && is_cntrl (ch))
5898 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
5899 if (is_alpha (ch) && is_digit (ch))
5900 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
5901 if (is_alpha (ch) && is_punct (ch))
5902 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
5903 if (is_alpha (ch) && is_space (ch))
5904 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
5906 /* space restriction: "No character specified for the keywords upper,
5907 lower, alpha, digit, graph or xdigit shall be specified."
5908 upper, lower, alpha already checked above. */
5909 if (is_space (ch) && is_digit (ch))
5910 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
5911 if (is_space (ch) && is_graph (ch))
5912 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
5913 if (is_space (ch) && is_xdigit (ch))
5914 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
5916 /* cntrl restriction: "No character specified for the keywords upper,
5917 lower, alpha, digit, punct, graph, print or xdigit shall be
5918 specified." upper, lower, alpha already checked above. */
5919 if (is_cntrl (ch) && is_digit (ch))
5920 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5921 if (is_cntrl (ch) && is_punct (ch))
5922 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5923 if (is_cntrl (ch) && is_graph (ch))
5924 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5925 if (is_cntrl (ch) && is_print (ch))
5926 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5927 if (is_cntrl (ch) && is_xdigit (ch))
5928 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5930 /* punct restriction: "No character specified for the keywords upper,
5931 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5932 be specified." upper, lower, alpha, cntrl already checked above. */
5933 if (is_punct (ch) && is_digit (ch))
5934 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5935 if (is_punct (ch) && is_xdigit (ch))
5936 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5937 if (is_punct (ch) && (ch == 0x0020))
5938 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5940 /* graph restriction: "No character specified for the keyword cntrl
5941 shall be specified." Already checked above. */
5943 /* print restriction: "No character specified for the keyword cntrl
5944 shall be specified." Already checked above. */
5946 /* graph - print relation: differ only in the <space> character.
5947 How is this possible if there are more than one space character?!
5948 I think susv2/xbd/locale.html should speak of "space characters",
5949 not "space character". */
/* Note the asymmetry: the first test accepts any is_space character, the
   second only U+0020.  */
5950 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5952 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5953 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5955 "%s is graph|<space> but not print\n", ucs_symbol (ch));
5958 fprintf (stream, "LC_CTYPE\n");
5959 output_charclass (stream, "upper", is_upper);
5960 output_charclass (stream, "lower", is_lower);
5961 output_charclass (stream, "alpha", is_alpha);
5962 output_charclass (stream, "digit", is_digit);
5963 output_charclass (stream, "outdigit", is_outdigit);
5964 output_charclass (stream, "blank", is_blank);
5965 output_charclass (stream, "space", is_space);
5966 output_charclass (stream, "cntrl", is_cntrl);
5967 output_charclass (stream, "punct", is_punct);
5968 output_charclass (stream, "xdigit", is_xdigit);
5969 output_charclass (stream, "graph", is_graph);
5970 output_charclass (stream, "print", is_print);
5971 output_charclass (stream, "class \"combining\";", is_combining);
5972 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5973 output_charmap (stream, "toupper", to_upper);
5974 output_charmap (stream, "tolower", to_lower);
5975 output_charmap (stream, "map \"totitle\";", to_title);
5976 output_widthmap (stream);
5977 fprintf (stream, "END LC_CTYPE\n");
/* fclose is only attempted when no stream error has been recorded.  */
5979 if (ferror (stream) || fclose (stream))
5981 fprintf (stderr, "error writing to '%s'\n", filename);
5988 /* ========================================================================= */
5990 /* The width property from the EastAsianWidth.txt file.
5991 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
5992 const char * unicode_width[0x110000];
/* fill_width (WIDTH_FILENAME): every entry of unicode_width[] is first set
   to "N" for code points that have a name in unicode_attributes[] and to
   NULL for all others.  The file is then read as three fields per line,
   delimited by ';', ' ' and newline; the first field is a hexadecimal code
   point or a "low..high" range, and the second field is duplicated with
   strdup into unicode_width[].
   NOTE(review): the strdup results are stored unchecked, and code points
   parsed from the file are used as array indices without a visible range
   check -- confirm that the input is trusted.  */
5994 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
5997 fill_width (const char *width_filename)
6001 char field0[FIELDLEN];
6002 char field1[FIELDLEN];
6003 char field2[FIELDLEN];
6006 for (i = 0; i < 0x110000; i++)
6007 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
6009 stream = fopen (width_filename, "r");
6012 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
6027 do c = getc (stream); while (c != EOF && c != '\n');
6031 n = getfield (stream, field0, ';');
6032 n += getfield (stream, field1, ' ');
6033 n += getfield (stream, field2, '\n');
6038 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
6041 i = strtoul (field0, NULL, 16);
6042 if (strstr (field0, "..") != NULL)
6044 /* Deal with a range. */
6045 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
6047 unicode_width[i] = strdup (field1);
6051 /* Single character line. */
6052 unicode_width[i] = strdup (field1);
6056 if (ferror (stream) || fclose (stream))
6058 fprintf (stderr, "error reading from '%s'\n", width_filename);
6063 /* ========================================================================= */
6065 /* Non-spacing attribute and width. */
6067 /* The non-spacing attribute table consists of:
6068 - Non-spacing characters; generated from PropList.txt or
6069 "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
6070 - Format control characters; generated from
6071 "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
6072 - Zero width characters; generated from
6073 "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
6077 is_nonspacing (unsigned int ch)
6079 return (unicode_attributes[ch].name != NULL
6080 && (get_bidi_category (ch) == UC_BIDI_NSM
6081 || is_category_Cc (ch) || is_category_Cf (ch)
6082 || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0));
/* Writes two C arrays to FILENAME.  The code space is cut into blocks of
   0x200 code points, and ind[] holds one index per block.
   nonspacing_table_data is a bitmap of is_nonspacing results, with one bit
   per code point and 64 bytes per listed block.  nonspacing_table_ind
   prints the ind[] values, eight per line.  The block at 0xe0000 is left
   out of the scan because it is handled by code.  */
6086 output_nonspacing_property (const char *filename)
6089 int ind[0x110000 / 0x200];
6094 stream = fopen (filename, "w");
6097 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* First pass: find the blocks that contain a nonspacing character and
   number them consecutively.  */
6102 for (i = 0; i < 0x110000 / 0x200; i++)
6104 bool nontrivial = false;
6107 if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */
6108 for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
6109 if (is_nonspacing (ch))
6115 ind[i] = next_ind++;
6120 fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
/* Second pass: print the bitmap of each numbered block as 8 rows of 8
   bytes; every byte covers 8 consecutive code points.  */
6123 for (i = 0; i < 0x110000 / 0x200; i++)
6125 bool nontrivial = (ind[i] >= 0);
6131 fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
6132 for (j = 0; j < 8; j++)
6136 fprintf (stream, " ");
6137 for (k = 0; k < 8; k++)
6140 unsigned char bits = 0;
6142 for (l = 0; l < 8; l++)
6144 unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;
6146 if (is_nonspacing (ch))
/* The separator is a blank instead of a comma after the very last byte of
   the last numbered block.  */
6149 fprintf (stream, " 0x%02x%c", bits,
6150 ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
6152 fprintf (stream, " /* 0x%04x-0x%04x */\n",
6153 i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
6158 fprintf (stream, "};\n");
/* Round the number of index entries up to a multiple of 8, so that the
   index table prints as full rows.  */
6160 i_max = ((i_max + 8 - 1) / 8) * 8;
6161 fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
6166 for (j = 0; j < i_max / 8; j++)
6170 fprintf (stream, " ");
6171 for (k = 0; k < 8; k++)
6174 fprintf (stream, " %2d%c", ind[i],
6175 j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ',');
6177 fprintf (stream, " /* 0x%04x-0x%04x */\n",
6178 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
6181 fprintf (stream, "};\n");
6183 if (ferror (stream) || fclose (stream))
6185 fprintf (stderr, "error writing to '%s'\n", filename);
6190 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'.
   The tests are tried in the order written below: unassigned code points
   (with special cases for the Private Use range and several CJK ranges),
   Cc characters below U+00A0, non-spacing characters, the strings "W"/"F"
   and "H" stored in unicode_width[ch], and finally the range
   U+00A1..U+FFFF.
   NOTE(review): the statements that select the result of each test are not
   visible in this excerpt; the caller output_width_property_test treats a
   result of 0 as "do not list this character". */
6192 symbolic_width (unsigned int ch)
6194 /* Test for unassigned character. */
6195 if (is_property_unassigned_code_value (ch))
6197 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
6198 if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
6200 if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
6201 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
6202 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
6203 || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
6204 || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */)
6210 /* Test for non-spacing or control character. */
/* Only Cc characters below U+00A0 match the first test. */
6211 if (is_category_Cc (ch) && ch < 0x00A0)
6213 if (is_nonspacing (ch))
6215 /* Test for double-width character. */
/* unicode_width[ch] may be NULL, so it is checked before every strcmp. */
6216 if (unicode_width[ch] != NULL
6217 && (strcmp (unicode_width[ch], "W") == 0
6218 || strcmp (unicode_width[ch], "F") == 0))
6220 /* Test for half-width character. */
6221 if (unicode_width[ch] != NULL
6222 && strcmp (unicode_width[ch], "H") == 0)
6225 /* In ancient CJK encodings, Cyrillic and most other characters are
6226 double-width as well. */
/* This last test covers U+00A1..U+FFFF only; code points from U+10000
   upwards do not match it. */
6227 if (ch >= 0x00A1 && ch < 0x10000)
/* Writes to FILENAME the symbolic_width result of every code point from 0
   up to 0x10FFFF, merging runs of equal values into intervals.
   An interval of one code point is printed as "%04X\t\t%c\n", a longer one
   as "%04X..%04X\t%c\n" (start, end, value).  Code points for which
   symbolic_width returns 0 are not listed.
   A diagnostic is printed to stderr when the file cannot be opened or when
   writing/closing it fails.
   NOTE(review): what happens after those diagnostics, the brace nesting of
   the loop body, and the initial assignment of interval_value are not
   visible in this excerpt.  Confirm that interval_value starts at 0, since
   0 is tested below as "no interval is open". */
6233 output_width_property_test (const char *filename)
6236 unsigned int interval_start, interval_end, ch;
6237 char interval_value;
6239 stream = fopen (filename, "w");
6242 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6247 interval_start = interval_end = 0; /* avoid GCC warning */
6248 for (ch = 0; ch < 0x110000; ch++)
6250 char value = symbolic_width (ch);
6251 if (value != 0) /* skip Cc control characters and unassigned characters */
6253 if (value == interval_value)
6254 /* Extend the interval. */
6258 /* Terminate the interval. */
/* An interval_value of 0 means there is no pending interval to print. */
6259 if (interval_value != 0)
6261 if (interval_end == interval_start)
6262 fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
6264 fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
6266 /* Start a new interval. */
6267 interval_start = interval_end = ch;
6268 interval_value = value;
6272 /* Terminate the last interval. */
/* The loop prints an interval only when the next one starts, so the final
   interval is flushed here with the same two output formats. */
6273 if (interval_value != 0)
6275 if (interval_end == interval_start)
6276 fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
6278 fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
/* Both a pending stream error and a failing fclose are reported. */
6281 if (ferror (stream) || fclose (stream))
6283 fprintf (stderr, "error writing to '%s'\n", filename);
6288 /* ========================================================================= */
6290 /* Line breaking classification.
6291 Updated for Unicode TR #14 revision 26. */
6295 /* Values >= 27 are resolved at run time.
   The values are not assigned in declaration order: the classes numbered
   0..26 and those numbered 27..34 are interleaved in the list below.
   get_lbp uses each value as a bit position in an int64_t mask
   ((int64_t) 1 << LBP_xx), so every value must stay below 63. */
6296 LBP_BK = 27, /* mandatory break */
6297 /*LBP_CR, carriage return - not used here because it's a DOSism */
6298 /*LBP_LF, line feed - not used here because it's a DOSism */
6299 LBP_CM = 28, /* attached characters and combining marks */
6300 /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
6301 /*LBP_SG, surrogates - not used here because they are not characters */
6302 LBP_WJ = 0, /* word joiner */
6303 LBP_ZW = 29, /* zero width space */
6304 LBP_GL = 1, /* non-breaking (glue) */
6305 LBP_SP = 30, /* space */
6306 LBP_B2 = 2, /* break opportunity before and after */
6307 LBP_BA = 3, /* break opportunity after */
6308 LBP_BB = 4, /* break opportunity before */
6309 LBP_HY = 5, /* hyphen */
6310 LBP_CB = 31, /* contingent break opportunity */
6311 LBP_CL = 6, /* closing punctuation */
6312 LBP_CP = 7, /* closing parenthesis */
6313 LBP_EX = 8, /* exclamation/interrogation */
6314 LBP_IN = 9, /* inseparable */
6315 LBP_NS = 10, /* non starter */
6316 LBP_OP = 11, /* opening punctuation */
6317 LBP_QU = 12, /* ambiguous quotation */
6318 LBP_IS = 13, /* infix separator (numeric) */
6319 LBP_NU = 14, /* numeric */
6320 LBP_PO = 15, /* postfix (numeric) */
6321 LBP_PR = 16, /* prefix (numeric) */
6322 LBP_SY = 17, /* symbols allowing breaks */
6323 LBP_AI = 32, /* ambiguous (alphabetic or ideograph) */
6324 LBP_AL = 18, /* ordinary alphabetic and symbol characters */
6325 /*LBP_CJ, conditional Japanese starter, resolved to NS */
6326 LBP_H2 = 19, /* Hangul LV syllable */
6327 LBP_H3 = 20, /* Hangul LVT syllable */
6328 LBP_HL = 25, /* Hebrew letter */
6329 LBP_ID = 21, /* ideographic */
6330 LBP_JL = 22, /* Hangul L Jamo */
6331 LBP_JV = 23, /* Hangul V Jamo */
6332 LBP_JT = 24, /* Hangul T Jamo */
6333 LBP_RI = 26, /* regional indicator */
6334 LBP_SA = 33, /* complex context (South East Asian) */
6335 LBP_XX = 34 /* unknown */
6338 /* Returns the line breaking classification for ch, as a bit mask. */
6340 get_lbp (unsigned int ch)
6344 /* U+20BC..U+20CF are reserved for prefixes. */
6345 if (unicode_attributes[ch].name == NULL && (ch >= 0x20BC && ch <= 0x20CF))
6346 return (int64_t) 1 << LBP_PR;
6348 if (unicode_attributes[ch].name != NULL)
6350 /* mandatory break */
6351 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
6352 || ch == 0x000C /* form feed */
6353 || ch == 0x000B /* line tabulation */
6354 || ch == 0x2028 /* LINE SEPARATOR */
6355 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
6356 attr |= (int64_t) 1 << LBP_BK;
6358 if (ch == 0x2060 /* WORD JOINER */
6359 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
6360 attr |= (int64_t) 1 << LBP_WJ;
6362 /* zero width space */
6363 if (ch == 0x200B /* ZERO WIDTH SPACE */)
6364 attr |= (int64_t) 1 << LBP_ZW;
6366 /* non-breaking (glue) */
6367 if (ch == 0x00A0 /* NO-BREAK SPACE */
6368 || ch == 0x202F /* NARROW NO-BREAK SPACE */
6369 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
6370 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
6371 || ch == 0x2007 /* FIGURE SPACE */
6372 || ch == 0x2011 /* NON-BREAKING HYPHEN */
6373 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
6374 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
6375 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
6376 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
6377 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6378 || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
6379 || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
6380 attr |= (int64_t) 1 << LBP_GL;
6383 if (ch == 0x0020 /* SPACE */)
6384 attr |= (int64_t) 1 << LBP_SP;
6386 /* break opportunity before and after */
6387 if (ch == 0x2014 /* EM DASH */
6388 || ch == 0x2E3A /* TWO-EM DASH */
6389 || ch == 0x2E3B /* THREE-EM DASH */)
6390 attr |= (int64_t) 1 << LBP_B2;
6392 /* break opportunity after */
6393 if (/* Breaking Spaces */
6394 ch == 0x1680 /* OGHAM SPACE MARK */
6395 || ch == 0x2000 /* EN QUAD */
6396 || ch == 0x2001 /* EM QUAD */
6397 || ch == 0x2002 /* EN SPACE */
6398 || ch == 0x2003 /* EM SPACE */
6399 || ch == 0x2004 /* THREE-PER-EM SPACE */
6400 || ch == 0x2005 /* FOUR-PER-EM SPACE */
6401 || ch == 0x2006 /* SIX-PER-EM SPACE */
6402 || ch == 0x2008 /* PUNCTUATION SPACE */
6403 || ch == 0x2009 /* THIN SPACE */
6404 || ch == 0x200A /* HAIR SPACE */
6405 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
6406 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
6408 || ch == 0x0009 /* tab */
6409 /* Conditional Hyphens */
6410 || ch == 0x00AD /* SOFT HYPHEN */
6411 /* Breaking Hyphens */
6412 || ch == 0x058A /* ARMENIAN HYPHEN */
6413 || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
6414 || ch == 0x2010 /* HYPHEN */
6415 || ch == 0x2012 /* FIGURE DASH */
6416 || ch == 0x2013 /* EN DASH */
6417 /* Visible Word Dividers */
6418 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
6419 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
6420 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
6421 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
6422 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
6423 || ch == 0x2027 /* HYPHENATION POINT */
6424 || ch == 0x007C /* VERTICAL LINE */
6425 /* Historic Word Separators */
6426 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
6427 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
6428 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
6429 || ch == 0x2056 /* THREE DOT PUNCTUATION */
6430 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
6431 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
6432 || ch == 0x205A /* TWO DOT PUNCTUATION */
6433 || ch == 0x205B /* FOUR DOT MARK */
6434 || ch == 0x205D /* TRICOLON */
6435 || ch == 0x205E /* VERTICAL FOUR DOTS */
6436 || ch == 0x2E19 /* PALM BRANCH */
6437 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
6438 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
6439 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
6440 || ch == 0x2E2D /* FIVE DOT MARK */
6441 || ch == 0x2E30 /* RING POINT */
6442 || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
6443 || ch == 0x2E33 /* RAISED DOT */
6444 || ch == 0x2E34 /* RAISED COMMA */
6445 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
6446 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
6447 || ch == 0x10102 /* AEGEAN CHECK MARK */
6448 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
6449 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
6450 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
6451 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
6453 || ch == 0x0964 /* DEVANAGARI DANDA */
6454 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
6455 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
6456 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
6457 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
6458 || ch == 0x104B /* MYANMAR SIGN SECTION */
6459 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
6460 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
6461 || ch == 0x17D4 /* KHMER SIGN KHAN */
6462 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
6463 || ch == 0x1B5E /* BALINESE CARIK SIKI */
6464 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
6465 || ch == 0xA8CE /* SAURASHTRA DANDA */
6466 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
6467 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
6468 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
6469 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
6470 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
6471 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
6473 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
6474 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
6475 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
6476 || ch == 0x0FBE /* TIBETAN KU RU KHA */
6477 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
6478 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
6479 /* Other Terminating Punctuation */
6480 || ch == 0x1804 /* MONGOLIAN COLON */
6481 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
6482 || ch == 0x1B5A /* BALINESE PANTI */
6483 || ch == 0x1B5B /* BALINESE PAMADA */
6484 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
6485 || ch == 0x1B60 /* BALINESE PAMENENG */
6486 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
6487 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
6488 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
6489 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
6490 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
6491 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
6492 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
6493 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
6494 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
6495 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
6496 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
6497 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
6498 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
6499 || ch == 0x2E3C /* STENOGRAPHIC FULL STOP */
6500 || ch == 0x2E3D /* VERTICAL SIX DOTS */
6501 || ch == 0x2E3E /* WIGGLY VERTICAL LINE */
6502 || ch == 0x2E40 /* DOUBLE HYPHEN */
6503 || ch == 0x2E41 /* REVERSED COMMA */
6504 || ch == 0xA60D /* VAI COMMA */
6505 || ch == 0xA60F /* VAI QUESTION MARK */
6506 || ch == 0xA92E /* KAYAH LI SIGN CWI */
6507 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
6508 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
6509 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
6510 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
6511 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
6512 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
6513 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
6514 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6515 || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
6516 || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
6517 || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
6518 || ch == 0xA6F3 /* BAMUM FULL STOP */
6519 || ch == 0xA6F4 /* BAMUM COLON */
6520 || ch == 0xA6F5 /* BAMUM COMMA */
6521 || ch == 0xA6F6 /* BAMUM SEMICOLON */
6522 || ch == 0xA6F7 /* BAMUM QUESTION MARK */
6523 || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
6524 || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
6525 || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
6526 || ch == 0xAAF0 /* MEETEI MAYEK CHEIKHAN */
6527 || ch == 0xAAF1 /* MEETEI MAYEK AHANG KHUDAM */
6528 || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
6529 || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
6530 || (ch >= 0x10AF0 && ch <= 0x10AF5) /* MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS */
6531 || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
6532 || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
6533 || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
6534 || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
6535 || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
6536 || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
6537 || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
6538 || ch == 0x11047 /* BRAHMI DANDA */
6539 || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
6540 || ch == 0x110BE /* KAITHI SECTION MARK */
6541 || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
6542 || ch == 0x110C0 /* KAITHI DANDA */
6543 || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
6544 || ch == 0x11140 /* CHAKMA SECTION MARK */
6545 || ch == 0x11141 /* CHAKMA DANDA */
6546 || ch == 0x11142 /* CHAKMA DOUBLE DANDA */
6547 || ch == 0x11143 /* CHAKMA QUESTION MARK */
6548 || ch == 0x111C5 /* SHARADA DANDA */
6549 || ch == 0x111C6 /* SHARADA DOUBLE DANDA */
6550 || ch == 0x111C8 /* SHARADA SEPARATOR */
6551 || (ch >= 0x111DD && ch <= 0x111DF) /* SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 */
6552 || ch == 0x11238 /* KHOJKI DANDA */
6553 || ch == 0x11239 /* KHOJKI DOUBLE DANDA */
6554 || ch == 0x1123B /* KHOJKI SECTION MARK */
6555 || ch == 0x1123C /* KHOJKI DOUBLE SECTION MARK */
6556 || ch == 0x112A9 /* MULTANI SECTION MARK */
6557 || ch == 0x115C2 /* SIDDHAM DANDA */
6558 || ch == 0x115C3 /* SIDDHAM DOUBLE DANDA */
6559 || (ch >= 0x115C9 && ch <= 0x115D7) /* SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES */
6560 || ch == 0x11641 /* MODI DANDA */
6561 || ch == 0x11642 /* MODI DOUBLE DANDA */
6562 || (ch >= 0x1173C && ch <= 0x1173E) /* AHOM SIGN SMALL SECTION..AHOM SIGN RULAI */
6563 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
6564 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
6565 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */
6566 || ch == 0x12474 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON */
6567 || ch == 0x16A6E /* MRO DANDA */
6568 || ch == 0x16A6F /* MRO DOUBLE DANDA */
6569 || ch == 0x16AF5 /* BASSA VAH FULL STOP */
6570 || ch == 0x16B37 /* PAHAWH HMONG SIGN VOS THOM */
6571 || ch == 0x16B38 /* PAHAWH HMONG SIGN VOS TSHAB CEEB */
6572 || ch == 0x16B39 /* PAHAWH HMONG SIGN CIM CHEEM */
6573 || ch == 0x16B44 /* PAHAWH HMONG SIGN XAUS */
6574 || ch == 0x1BC9F /* DUPLOYAN PUNCTUATION CHINOOK FULL STOP */
6575 || (ch >= 0x1DA87 && ch <= 0x1DA8A) /* SIGNWRITING COMMA..SIGNWRITING COLON */)
6576 attr |= (int64_t) 1 << LBP_BA;
6578 /* break opportunity before */
6579 if (ch == 0x00B4 /* ACUTE ACCENT */
6580 || ch == 0x1FFD /* GREEK OXIA */
6581 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
6582 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
6583 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
6584 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
6585 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
6586 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
6587 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
6588 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
6589 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
6590 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
6591 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
6592 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
6593 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
6594 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
6595 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
6596 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
6597 || ch == 0xA8FC /* DEVANAGARI SIGN SIDDHAM */
6598 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */
6599 || ch == 0x11175 /* MAHAJANI SECTION MARK */
6600 || ch == 0x111DB /* SHARADA SIGN SIDDHAM */
6601 || ch == 0x115C1 /* SIDDHAM SIGN SIDDHAM */)
6602 attr |= (int64_t) 1 << LBP_BB;
6605 if (ch == 0x002D /* HYPHEN-MINUS */)
6606 attr |= (int64_t) 1 << LBP_HY;
6608 /* contingent break opportunity */
6609 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
6610 attr |= (int64_t) 1 << LBP_CB;
6612 /* closing parenthesis */
6613 if (ch == 0x0029 /* RIGHT PARENTHESIS */
6614 || ch == 0x005D /* RIGHT SQUARE BRACKET */)
6615 attr |= (int64_t) 1 << LBP_CP;
6617 /* closing punctuation */
6618 if ((unicode_attributes[ch].category[0] == 'P'
6619 && unicode_attributes[ch].category[1] == 'e'
6620 && !(attr & ((int64_t) 1 << LBP_CP)))
6621 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
6622 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
6623 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
6624 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
6625 || ch == 0xFE50 /* SMALL COMMA */
6626 || ch == 0xFE52 /* SMALL FULL STOP */
6627 || ch == 0xFF0C /* FULLWIDTH COMMA */
6628 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
6629 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
6630 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
6631 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6632 || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
6633 || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
6634 || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
6635 || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
6636 || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
6637 || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
6638 || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
6639 || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */
6640 || ch == 0x145CF /* ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK */)
6641 attr |= (int64_t) 1 << LBP_CL;
6643 /* exclamation/interrogation */
6644 if (ch == 0x0021 /* EXCLAMATION MARK */
6645 || ch == 0x003F /* QUESTION MARK */
6646 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
6647 || ch == 0x061B /* ARABIC SEMICOLON */
6648 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
6649 || ch == 0x061F /* ARABIC QUESTION MARK */
6650 || ch == 0x06D4 /* ARABIC FULL STOP */
6651 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
6652 || ch == 0x0F0D /* TIBETAN MARK SHAD */
6653 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
6654 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
6655 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
6656 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
6657 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
6658 || ch == 0x1802 /* MONGOLIAN COMMA */
6659 || ch == 0x1803 /* MONGOLIAN FULL STOP */
6660 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
6661 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
6662 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
6663 || ch == 0x1945 /* LIMBU QUESTION MARK */
6664 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
6665 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
6666 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
6667 || ch == 0x2CFE /* COPTIC FULL STOP */
6668 || ch == 0x2E2E /* REVERSED QUESTION MARK */
6669 || ch == 0xA60E /* VAI FULL STOP */
6670 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
6671 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
6672 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
6673 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
6674 || ch == 0xFE56 /* SMALL QUESTION MARK */
6675 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
6676 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
6677 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */
6678 || ch == 0x115C4 /* SIDDHAM SEPARATOR DOT */
6679 || ch == 0x115C5 /* SIDDHAM SEPARATOR BAR */)
6680 attr |= (int64_t) 1 << LBP_EX;
6683 if (ch == 0x2024 /* ONE DOT LEADER */
6684 || ch == 0x2025 /* TWO DOT LEADER */
6685 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
6686 || ch == 0x22EF /* MIDLINE HORIZONTAL ELLIPSIS */
6687 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */
6688 || ch == 0x10AF6 /* MANICHAEAN PUNCTUATION LINE FILLER */)
6689 attr |= (int64_t) 1 << LBP_IN;
6692 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
6693 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
6694 || ch == 0x203D /* INTERROBANG */
6695 || ch == 0x2047 /* DOUBLE QUESTION MARK */
6696 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
6697 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
6698 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
6699 || ch == 0x301C /* WAVE DASH */
6700 || ch == 0x303C /* MASU MARK */
6701 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
6702 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
6703 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
6704 || ch == 0x309D /* HIRAGANA ITERATION MARK */
6705 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
6706 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
6707 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
6708 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6709 || ch == 0x30FD /* KATAKANA ITERATION MARK */
6710 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
6711 || ch == 0xA015 /* YI SYLLABLE WU */
6712 || ch == 0xFE54 /* SMALL SEMICOLON */
6713 || ch == 0xFE55 /* SMALL COLON */
6714 || ch == 0xFF1A /* FULLWIDTH COLON */
6715 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
6716 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
6717 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6718 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
6719 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
6720 || ch == 0x1F679 /* HEAVY INTERROBANG ORNAMENT */
6721 || ch == 0x1F67A /* SANS-SERIF INTERROBANG ORNAMENT */
6722 || ch == 0x1F67B /* HEAVY SANS-SERIF INTERROBANG ORNAMENT */
6723 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
6724 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
6725 attr |= (int64_t) 1 << LBP_NS;
6727 /* opening punctuation */
6728 if ((unicode_attributes[ch].category[0] == 'P'
6729 && unicode_attributes[ch].category[1] == 's')
6730 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
6731 || ch == 0x00BF /* INVERTED QUESTION MARK */
6732 || ch == 0x2E18 /* INVERTED INTERROBANG */
6733 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6734 || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
6735 || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
6736 || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
6737 || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
6738 || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
6739 || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */
6740 || ch == 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */)
6741 attr |= (int64_t) 1 << LBP_OP;
6743 /* ambiguous quotation */
6744 if ((unicode_attributes[ch].category[0] == 'P'
6745 && (unicode_attributes[ch].category[1] == 'f'
6746 || unicode_attributes[ch].category[1] == 'i'))
6747 || ch == 0x0022 /* QUOTATION MARK */
6748 || ch == 0x0027 /* APOSTROPHE */
6749 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
6750 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
6751 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
6752 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6753 || ch == 0x275F /* HEAVY LOW SINGLE COMMA QUOTATION MARK ORNAMENT */
6754 || ch == 0x2760 /* HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */
6755 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
6756 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
6757 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
6758 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
6759 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
6760 || ch == 0x2E0B /* RAISED SQUARE */
6761 || ch == 0x1F676 /* SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
6762 || ch == 0x1F677 /* SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6763 || ch == 0x1F678 /* SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */)
6764 attr |= (int64_t) 1 << LBP_QU;
6766 /* infix separator (numeric) */
6767 if (ch == 0x002C /* COMMA */
6768 || ch == 0x002E /* FULL STOP */
6769 || ch == 0x003A /* COLON */
6770 || ch == 0x003B /* SEMICOLON */
6771 || ch == 0x037E /* GREEK QUESTION MARK */
6772 || ch == 0x0589 /* ARMENIAN FULL STOP */
6773 || ch == 0x060C /* ARABIC COMMA */
6774 || ch == 0x060D /* ARABIC DATE SEPARATOR */
6775 || ch == 0x07F8 /* NKO COMMA */
6776 || ch == 0x2044 /* FRACTION SLASH */
6777 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
6778 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
6779 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
6780 attr |= (int64_t) 1 << LBP_IS;
6783 if ((unicode_attributes[ch].category[0] == 'N'
6784 && unicode_attributes[ch].category[1] == 'd'
6785 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
6786 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
6787 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
6788 attr |= (int64_t) 1 << LBP_NU;
6790 /* postfix (numeric) */
6791 if (ch == 0x0025 /* PERCENT SIGN */
6792 || ch == 0x00A2 /* CENT SIGN */
6793 || ch == 0x00B0 /* DEGREE SIGN */
6794 || ch == 0x060B /* AFGHANI SIGN */
6795 || ch == 0x066A /* ARABIC PERCENT SIGN */
6796 || ch == 0x2030 /* PER MILLE SIGN */
6797 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
6798 || ch == 0x2032 /* PRIME */
6799 || ch == 0x2033 /* DOUBLE PRIME */
6800 || ch == 0x2034 /* TRIPLE PRIME */
6801 || ch == 0x2035 /* REVERSED PRIME */
6802 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
6803 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
6804 || ch == 0x20A7 /* PESETA SIGN */
6805 || ch == 0x20BB /* NORDIC MARK SIGN */
6806 || ch == 0x2103 /* DEGREE CELSIUS */
6807 || ch == 0x2109 /* DEGREE FAHRENHEIT */
6808 || ch == 0xFDFC /* RIAL SIGN */
6809 || ch == 0xFE6A /* SMALL PERCENT SIGN */
6810 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
6811 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
6812 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6813 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
6814 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
6815 || ch == 0x09F2 /* BENGALI RUPEE MARK */
6816 || ch == 0x09F3 /* BENGALI RUPEE SIGN */
6817 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
6818 || ch == 0x0D79 /* MALAYALAM DATE MARK */
6819 || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
6820 || ch == 0x20BE /* LARI SIGN */
6821 || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
6822 attr |= (int64_t) 1 << LBP_PO;
6824 /* prefix (numeric) */
6825 if ((unicode_attributes[ch].category[0] == 'S'
6826 && unicode_attributes[ch].category[1] == 'c')
6827 || ch == 0x002B /* PLUS SIGN */
6828 || ch == 0x005C /* REVERSE SOLIDUS */
6829 || ch == 0x00B1 /* PLUS-MINUS SIGN */
6830 || ch == 0x2116 /* NUMERO SIGN */
6831 || ch == 0x2212 /* MINUS SIGN */
6832 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
6833 if (!(attr & ((int64_t) 1 << LBP_PO)))
6834 attr |= (int64_t) 1 << LBP_PR;
6836 /* symbols allowing breaks */
6837 if (ch == 0x002F /* SOLIDUS */)
6838 attr |= (int64_t) 1 << LBP_SY;
6840 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
6841 attr |= (int64_t) 1 << LBP_H2;
6843 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
6844 attr |= (int64_t) 1 << LBP_H3;
6846 if ((ch >= 0x05D0 && ch <= 0x05F2) || ch == 0xFB1D
6847 || (ch >= 0xFB1F && ch <= 0xFB28) || (ch >= 0xFB2A && ch <= 0xFB4F))
6848 attr |= (int64_t) 1 << LBP_HL;
6850 if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
6851 attr |= (int64_t) 1 << LBP_JL;
6853 if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
6854 attr |= (int64_t) 1 << LBP_JV;
6856 if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
6857 attr |= (int64_t) 1 << LBP_JT;
6859 /* regional indicator */
6860 if (ch >= 0x1F1E6 && ch <= 0x1F1FF)
6861 attr |= (int64_t) 1 << LBP_RI;
6863 /* complex context (South East Asian) */
6864 if (((unicode_attributes[ch].category[0] == 'C'
6865 && unicode_attributes[ch].category[1] == 'f')
6866 || (unicode_attributes[ch].category[0] == 'L'
6867 && (unicode_attributes[ch].category[1] == 'm'
6868 || unicode_attributes[ch].category[1] == 'o'))
6869 || (unicode_attributes[ch].category[0] == 'M'
6870 && (unicode_attributes[ch].category[1] == 'c'
6871 || unicode_attributes[ch].category[1] == 'n')
6872 && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
6873 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6874 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
6875 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
6876 || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
6877 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
6878 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
6879 || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
6880 || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
6881 || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
6882 || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
6883 || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */
6884 || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
6885 || ch == 0x1173F /* Ahom */)
6886 && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
6887 || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
6888 || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
6889 || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
6890 || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
6891 || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
6892 || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
6893 || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */
6894 || (ch >= 0x11700 && ch <= 0x11719) /* Ahom */
6895 || (ch >= 0x1171D && ch <= 0x1172B) /* Ahom */
6896 || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
6897 || ch == 0x1173F /* Ahom */))
6898 attr |= (int64_t) 1 << LBP_SA;
6900 /* attached characters and combining marks */
6901 if ((unicode_attributes[ch].category[0] == 'M'
6902 && (unicode_attributes[ch].category[1] == 'c'
6903 || unicode_attributes[ch].category[1] == 'e'
6904 || unicode_attributes[ch].category[1] == 'n'))
6905 || (unicode_attributes[ch].category[0] == 'C'
6906 && (unicode_attributes[ch].category[1] == 'c'
6907 || unicode_attributes[ch].category[1] == 'f')
6908 && ch != 0x110BD /* KAITHI NUMBER SIGN */)
6909 || ch == 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
6910 if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
6911 attr |= (int64_t) 1 << LBP_CM;
6914 if (ch == 0x231A /* WATCH */
6915 || ch == 0x231B /* HOURGLASS */
6916 || ch == 0x23F0 /* ALARM CLOCK */
6917 || ch == 0x23F1 /* STOPWATCH */
6918 || ch == 0x23F2 /* TIMER CLOCK */
6919 || ch == 0x23F3 /* HOURGLASS WITH FLOWING SAND */
6920 || ch == 0x2600 /* BLACK SUN WITH RAYS */
6921 || ch == 0x2601 /* CLOUD */
6922 || ch == 0x2602 /* UMBRELLA */
6923 || ch == 0x2603 /* SNOWMAN */
6924 || ch == 0x2614 /* UMBRELLA WITH RAIN DROPS */
6925 || ch == 0x2615 /* HOT BEVERAGE */
6926 || ch == 0x2618 /* SHAMROCK */
6927 || ch == 0x261A /* BLACK LEFT POINTING INDEX */
6928 || ch == 0x261B /* BLACK RIGHT POINTING INDEX */
6929 || ch == 0x261C /* WHITE LEFT POINTING INDEX */
6930 || ch == 0x261D /* WHITE UP POINTING INDEX */
6931 || ch == 0x261E /* WHITE RIGHT POINTING INDEX */
6932 || ch == 0x261F /* WHITE DOWN POINTING INDEX */
6933 || ch == 0x2639 /* WHITE FROWNING FACE */
6934 || ch == 0x263A /* WHITE SMILING FACE */
6935 || ch == 0x263B /* BLACK SMILING FACE */
6936 || ch == 0x2668 /* HOT SPRINGS */
6937 || ch == 0x267F /* WHEELCHAIR SYMBOL */
6938 || ch == 0x26BD /* SOCCER BALL */
6939 || ch == 0x26BE /* BASEBALL */
6940 || ch == 0x26BF /* SQUARED KEY */
6941 || ch == 0x26C0 /* WHITE DRAUGHTS MAN */
6942 || ch == 0x26C1 /* WHITE DRAUGHTS KING */
6943 || ch == 0x26C2 /* BLACK DRAUGHTS MAN */
6944 || ch == 0x26C3 /* BLACK DRAUGHTS KING */
6945 || ch == 0x26C4 /* SNOWMAN WITHOUT SNOW */
6946 || ch == 0x26C5 /* SUN BEHIND CLOUD */
6947 || ch == 0x26C6 /* RAIN */
6948 || ch == 0x26C7 /* BLACK SNOWMAN */
6949 || ch == 0x26C8 /* THUNDER CLOUD AND RAIN */
6950 || ch == 0x26CD /* DISABLED CAR */
6951 || ch == 0x26CF /* PICK */
6952 || ch == 0x26D0 /* CAR SLIDING */
6953 || ch == 0x26D1 /* HELMET WITH WHITE CROSS */
6954 || ch == 0x26D3 /* CHAINS */
6955 || ch == 0x26D4 /* NO ENTRY */
6956 || ch == 0x26D8 /* BLACK LEFT LANE MERGE */
6957 || ch == 0x26D9 /* WHITE LEFT LANE MERGE */
6958 || ch == 0x26DC /* LEFT CLOSED ENTRY */
6959 || ch == 0x26DF /* BLACK TRUCK */
6960 || ch == 0x26E0 /* RESTRICTED LEFT ENTRY-1 */
6961 || ch == 0x26E1 /* RESTRICTED LEFT ENTRY-2 */
6962 || ch == 0x26EA /* CHURCH */
6963 || ch == 0x26F1 /* UMBRELLA ON GROUND */
6964 || ch == 0x26F2 /* FOUNTAIN */
6965 || ch == 0x26F3 /* FLAG IN HOLE */
6966 || ch == 0x26F4 /* FERRY */
6967 || ch == 0x26F5 /* SAILBOAT */
6968 || ch == 0x26F7 /* SKIER */
6969 || ch == 0x26F8 /* ICE SKATE */
6970 || ch == 0x26F9 /* PERSON WITH BALL */
6971 || ch == 0x26FA /* TENT */
6972 || ch == 0x26FD /* FUEL PUMP */
6973 || ch == 0x26FE /* CUP ON BLACK SQUARE */
6974 || ch == 0x26FF /* WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE */
6975 || ch == 0x2700 /* BLACK SAFETY SCISSORS */
6976 || ch == 0x2701 /* UPPER BLADE SCISSORS */
6977 || ch == 0x2702 /* BLACK SCISSORS */
6978 || ch == 0x2703 /* LOWER BLADE SCISSORS */
6979 || ch == 0x2704 /* WHITE SCISSORS */
6980 || ch == 0x2708 /* AIRPLANE */
6981 || ch == 0x2709 /* ENVELOPE */
6982 || ch == 0x270A /* RAISED FIST */
6983 || ch == 0x270B /* RAISED HAND */
6984 || ch == 0x270C /* VICTORY HAND */
6985 || ch == 0x270D /* WRITING HAND */
6986 || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
6987 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
6988 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
6989 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
6990 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
6991 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
6992 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
6993 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
6994 || ch == 0xFE62 /* SMALL PLUS SIGN */
6995 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
6996 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
6997 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
6998 || ch == 0xFE66 /* SMALL EQUALS SIGN */
6999 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
7000 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
7001 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
7002 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
7003 || (ch >= 0x3000 && ch <= 0x33FF
7004 && !(attr & (((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
7005 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7006 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
7007 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
7008 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
7009 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
7010 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
7011 || ch == 0xFE45 /* SESAME DOT */
7012 || ch == 0xFE46 /* WHITE SESAME DOT */
7013 || ch == 0xFE49 /* DASHED OVERLINE */
7014 || ch == 0xFE4A /* CENTRELINE OVERLINE */
7015 || ch == 0xFE4B /* WAVY OVERLINE */
7016 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
7017 || ch == 0xFE4D /* DASHED LOW LINE */
7018 || ch == 0xFE4E /* CENTRELINE LOW LINE */
7019 || ch == 0xFE4F /* WAVY LOW LINE */
7020 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
7021 || ch == 0xFE58 /* SMALL EM DASH */
7022 || ch == 0xFE5F /* SMALL NUMBER SIGN */
7023 || ch == 0xFE60 /* SMALL AMPERSAND */
7024 || ch == 0xFE61 /* SMALL ASTERISK */
7025 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
7026 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
7027 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
7028 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
7029 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
7030 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
7031 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
7032 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
7033 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
7034 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
7035 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
7036 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
7037 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
7038 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
7039 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
7040 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
7041 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
7042 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
7043 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
7044 || ch == 0xFF5E /* FULLWIDTH TILDE */
7045 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
7046 || ch == 0xFFE3 /* FULLWIDTH MACRON */
7047 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
7048 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7049 || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
7050 || (ch >= 0x1F000 && ch <= 0x1F02B) /* Mahjong Tiles */
7051 || (ch >= 0x1F030 && ch <= 0x1F093) /* Domino Tiles */
7052 || (ch >= 0x1F0A0 && ch <= 0x1F0F5) /* Playing Cards */
7053 || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
7054 || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
7055 || (ch >= 0x1F300 && ch <= 0x1F5FF /* Miscellaneous Symbols and Pictographs */
7056 && ch != 0x1F3B5 && ch != 0x1F3B6 && ch != 0x1F3BC
7057 && ch != 0x1F4A0 && ch != 0x1F4A2 && ch != 0x1F4A4
7058 && ch != 0x1F4AF && ch != 0x1F4B1 && ch != 0x1F4B2
7059 && !(ch >= 0x1F39C && ch <= 0x1F39D)
7060 && !(ch >= 0x1F3FB && ch <= 0x1F3FF)
7061 && !(ch >= 0x1F500 && ch <= 0x1F506)
7062 && !(ch >= 0x1F517 && ch <= 0x1F524)
7063 && !(ch >= 0x1F532 && ch <= 0x1F549)
7064 && !(ch >= 0x1F5D4 && ch <= 0x1F5DB)
7065 && !(ch >= 0x1F5F4 && ch <= 0x1F5F9))
7066 || (ch >= 0x1F600 && ch <= 0x1F64F) /* Emoticons */
7067 || (ch >= 0x1F680 && ch <= 0x1F6D0) /* Transport and Map Symbols */
7068 || (ch >= 0x1F6E0 && ch <= 0x1F6EC) /* Transport and Map Symbols */
7069 || (ch >= 0x1F6F0 && ch <= 0x1F6F3) /* Transport and Map Symbols */
7070 || (ch >= 0x1F900 && ch <= 0x1F9FF) /* Supplemental Symbols and Pictographs */
7071 || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
7072 || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */
7073 || (ch >= 0x2B820 && ch <= 0x2CEAF) /* CJK Ideograph Extension E */)
7074 if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
7076 /* ambiguous (ideograph) ? */
7077 if ((unicode_width[ch] != NULL
7078 && unicode_width[ch][0] == 'A'
7087 && !(ch >= 0x26C4 && ch <= 0x26C8)
7101 && !(ch >= 0x26F1 && ch <= 0x26F5)
7102 && !(ch >= 0x26F7 && ch <= 0x26FA)
7103 && !(ch >= 0x26FD && ch <= 0x26FF))
7104 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
7105 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
7106 attr |= (int64_t) 1 << LBP_AI;
7108 attr |= (int64_t) 1 << LBP_ID;
7111 /* ordinary alphabetic and symbol characters */
7112 if ((unicode_attributes[ch].category[0] == 'L'
7113 && (unicode_attributes[ch].category[1] == 'u'
7114 || unicode_attributes[ch].category[1] == 'l'
7115 || unicode_attributes[ch].category[1] == 't'
7116 || unicode_attributes[ch].category[1] == 'm'
7117 || unicode_attributes[ch].category[1] == 'o'))
7118 || (unicode_attributes[ch].category[0] == 'S'
7119 && (unicode_attributes[ch].category[1] == 'm'
7120 || unicode_attributes[ch].category[1] == 'k'
7121 || unicode_attributes[ch].category[1] == 'o'))
7122 || (unicode_attributes[ch].category[0] == 'N'
7123 && (unicode_attributes[ch].category[1] == 'l'
7124 || unicode_attributes[ch].category[1] == 'o'))
7125 || (unicode_attributes[ch].category[0] == 'P'
7126 && (unicode_attributes[ch].category[1] == 'c'
7127 || unicode_attributes[ch].category[1] == 'd'
7128 || unicode_attributes[ch].category[1] == 'o'))
7129 || ch == 0x0600 /* ARABIC NUMBER SIGN */
7130 || ch == 0x0601 /* ARABIC SIGN SANAH */
7131 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
7132 || ch == 0x0603 /* ARABIC SIGN SAFHA */
7133 || ch == 0x0604 /* ARABIC SIGN SAMVAT */
7134 || ch == 0x0605 /* ARABIC NUMBER MARK ABOVE */
7135 || ch == 0x06DD /* ARABIC END OF AYAH */
7136 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
7137 || ch == 0x2061 /* FUNCTION APPLICATION */
7138 || ch == 0x2062 /* INVISIBLE TIMES */
7139 || ch == 0x2063 /* INVISIBLE SEPARATOR */
7140 || ch == 0x2064 /* INVISIBLE PLUS */
7141 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7142 || ch == 0x110BD /* KAITHI NUMBER SIGN */)
7143 if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))
7144 && ch != 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
7146 /* ambiguous (alphabetic) ? */
7147 if ((unicode_width[ch] != NULL
7148 && unicode_width[ch][0] == 'A'
7150 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
7151 && ch != 0x2022 /* BULLET */
7152 && ch != 0x203E /* OVERLINE */
7153 && ch != 0x2126 /* OHM SIGN */
7154 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
7155 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
7156 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
7157 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
7158 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
7159 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
7160 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
7161 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
7162 || ch == 0x00A7 /* SECTION SIGN */
7163 || ch == 0x00A8 /* DIAERESIS */
7164 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
7165 || ch == 0x00B2 /* SUPERSCRIPT TWO */
7166 || ch == 0x00B3 /* SUPERSCRIPT THREE */
7167 || ch == 0x00B6 /* PILCROW SIGN */
7168 || ch == 0x00B7 /* MIDDLE DOT */
7169 || ch == 0x00B8 /* CEDILLA */
7170 || ch == 0x00B9 /* SUPERSCRIPT ONE */
7171 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
7172 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
7173 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
7174 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
7175 || ch == 0x00D7 /* MULTIPLICATION SIGN */
7176 || ch == 0x00F7 /* DIVISION SIGN */
7177 || ch == 0x02C7 /* CARON */
7178 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
7179 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
7180 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
7181 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
7182 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
7183 || ch == 0x02D8 /* BREVE */
7184 || ch == 0x02D9 /* DOT ABOVE */
7185 || ch == 0x02DA /* RING ABOVE */
7186 || ch == 0x02DB /* OGONEK */
7187 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
7188 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
7189 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
7190 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7191 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
7192 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
7193 || ch == 0x2616 /* WHITE SHOGI PIECE */
7194 || ch == 0x2617 /* BLACK SHOGI PIECE */
7195 || ch == 0x1F10B /* DINGBAT CIRCLED SANS-SERIF DIGIT ZERO */
7196 || ch == 0x1F10C /* DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO */)
7197 attr |= (int64_t) 1 << LBP_AI;
7199 attr |= (int64_t) 1 << LBP_AL;
7200 attr &= ~((int64_t) 1 << LBP_CM);
7205 /* Unassigned character. */
7206 if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
7207 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
7208 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
7209 || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
7210 || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
7211 Supplementary Ideographic Plane (Plane 2) outside of blocks */
7212 || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
7213 Supplementary Ideographic Plane (Plane 2) outside of blocks */
7214 || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
7215 attr |= (int64_t) 1 << LBP_ID;
7220 attr |= (int64_t) 1 << LBP_XX;
7225 /* Output the line breaking properties in a human readable format.
   For every code point from 0 to 0x10FFFF the property bit mask is obtained
   from get_lbp.  A code point whose mask is exactly the LBP_XX bit produces
   no output; any other code point produces one line on STREAM consisting of
   the code point printed as "0x%04X" followed by the name of each LBP_* bit
   that is set in its mask. */
7227 debug_output_lbp (FILE *stream)
/* Visit the entire Unicode code point range. */
7231 for (i = 0; i < 0x110000; i++)
7233 int64_t attr = get_lbp (i);
/* Skip code points whose mask consists of the LBP_XX bit alone. */
7234 if (attr != (int64_t) 1 << LBP_XX)
7236 fprintf (stream, "0x%04X", i);
/* PRINT_BIT emits " <name>" when the given bit is set in attr; the name is
   the stringized macro argument, e.g. " LBP_BK". */
7237 #define PRINT_BIT(attr,bit) \
7238 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
7239 PRINT_BIT(attr,LBP_BK);
7240 PRINT_BIT(attr,LBP_CM);
7241 PRINT_BIT(attr,LBP_WJ);
7242 PRINT_BIT(attr,LBP_ZW);
7243 PRINT_BIT(attr,LBP_GL);
7244 PRINT_BIT(attr,LBP_SP);
7245 PRINT_BIT(attr,LBP_B2);
7246 PRINT_BIT(attr,LBP_BA);
7247 PRINT_BIT(attr,LBP_BB);
7248 PRINT_BIT(attr,LBP_HY);
7249 PRINT_BIT(attr,LBP_CB);
7250 PRINT_BIT(attr,LBP_CL);
7251 PRINT_BIT(attr,LBP_CP);
7252 PRINT_BIT(attr,LBP_EX);
7253 PRINT_BIT(attr,LBP_IN);
7254 PRINT_BIT(attr,LBP_NS);
7255 PRINT_BIT(attr,LBP_OP);
7256 PRINT_BIT(attr,LBP_QU);
7257 PRINT_BIT(attr,LBP_IS);
7258 PRINT_BIT(attr,LBP_NU);
7259 PRINT_BIT(attr,LBP_PO);
7260 PRINT_BIT(attr,LBP_PR);
7261 PRINT_BIT(attr,LBP_SY);
7262 PRINT_BIT(attr,LBP_AI);
7263 PRINT_BIT(attr,LBP_AL);
7264 PRINT_BIT(attr,LBP_H2);
7265 PRINT_BIT(attr,LBP_H3);
7266 PRINT_BIT(attr,LBP_HL);
7267 PRINT_BIT(attr,LBP_ID);
7268 PRINT_BIT(attr,LBP_JL);
7269 PRINT_BIT(attr,LBP_JV);
7270 PRINT_BIT(attr,LBP_JT);
7271 PRINT_BIT(attr,LBP_RI);
7272 PRINT_BIT(attr,LBP_SA);
7273 PRINT_BIT(attr,LBP_XX);
/* Terminate the line for this code point. */
7275 fprintf (stream, "\n");
/* Writes the human readable dump produced by debug_output_lbp to the file
   named FILENAME, which is opened for writing (an existing file is
   truncated by the "w" mode).  Diagnostics go to stderr: one message for a
   file that cannot be opened (presumably when fopen returns NULL; the test
   itself is not visible in this excerpt) and one when the stream reports an
   error or fclose fails. */
7281 debug_output_lbrk_tables (const char *filename)
7285 stream = fopen (filename, "w");
7288 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7292 debug_output_lbp (stream);
/* fclose is only attempted when no stream error has been recorded; either
   failure leads to the same diagnostic. */
7294 if (ferror (stream) || fclose (stream))
7296 fprintf (stderr, "error writing to '%s'\n", filename);
7301 /* The line breaking property from the LineBreak.txt file.
   Indexed by code point (0 .. 0x10FFFF).  Each element holds a single LBP_*
   value (not a bit mask); fill_org_lbp presets every element to LBP_XX
   before storing the values it parses. */
7302 int unicode_org_lbp[0x110000];
7304 /* Stores in unicode_org_lbp[] the line breaking property from the
   LineBreak.txt file.
   LINEBREAK_FILENAME is the path of that file.  Every element is first set
   to LBP_XX; then each data line is split with getfield into a code point
   (or "first..last" range) field terminated by ';', a property name field
   terminated by ' ', and the rest of the line.  The property name is
   translated to an LBP_* value which is stored for the code point(s).
   Diagnostics for open failures, short lines, unknown property names and
   read errors are written to stderr. */
7307 fill_org_lbp (const char *linebreak_filename)
7311 char field0[FIELDLEN];
7312 char field1[FIELDLEN];
7313 char field2[FIELDLEN];
/* Default for code points that the file does not mention. */
7316 for (i = 0; i < 0x110000; i++)
7317 unicode_org_lbp[i] = LBP_XX;
7319 stream = fopen (linebreak_filename, "r");
7322 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Consume the remainder of the current line (up to newline or EOF);
   presumably used to skip comment lines - the enclosing test is not visible
   in this excerpt. */
7338 do c = getc (stream); while (c != EOF && c != '\n');
/* field0: code point or range, field1: property name, field2: rest of the
   line.  n accumulates the getfield results. */
7342 n = getfield (stream, field0, ';');
7343 n += getfield (stream, field1, ' ');
7344 n += getfield (stream, field2, '\n');
7349 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* TRY compares field1 with the stringized constant name after skipping its
   first four characters (presumably the "LBP_" prefix), so TRY(LBP_xx)
   would match the two-letter name used in the file. */
7353 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Property values that have no LBP_* constant of their own are folded into
   an existing one: LF, CR and NL become LBP_BK, SG becomes LBP_XX and CJ
   becomes LBP_NS. */
7391 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
7392 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
7393 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
7394 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
7395 else if (strcmp (field1, "CJ") == 0) value = LBP_NS;
7398 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
7399 field1, linebreak_filename, lineno);
/* The code point is given in hexadecimal.
   NOTE(review): no range check against 0x110000 is visible for i (or j
   below) before they index unicode_org_lbp - confirm that malformed input
   cannot write out of bounds. */
7402 i = strtoul (field0, NULL, 16);
7403 if (strstr (field0, "..") != NULL)
7405 /* Deal with a range. */
/* The upper bound is the hexadecimal number following the "..". */
7406 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
7408 unicode_org_lbp[i] = value;
7412 /* Single character line. */
7413 unicode_org_lbp[i] = value;
/* fclose is only attempted when no stream error has been recorded. */
7417 if (ferror (stream) || fclose (stream))
7419 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
7424 /* Output the line breaking properties in a human readable format.
   This variant dumps the values stored in unicode_org_lbp[] (the ones taken
   from LineBreak.txt).  Each output line on STREAM consists of the code
   point printed as "0x%04X" followed by the name of the LBP_* constant that
   equals the stored value. */
7426 debug_output_org_lbp (FILE *stream)
7430 for (i = 0; i < 0x110000; i++)
/* Here attr is one LBP_* value, not a bit mask. */
7432 int attr = unicode_org_lbp[i];
7435 fprintf (stream, "0x%04X", i);
/* PRINT_BIT emits " <name>" when attr equals the given constant; since the
   comparison is by equality, at most one name is printed per distinct
   constant value. */
7436 #define PRINT_BIT(attr,bit) \
7437 if (attr == bit) fprintf (stream, " " #bit);
7438 PRINT_BIT(attr,LBP_BK);
7439 PRINT_BIT(attr,LBP_CM);
7440 PRINT_BIT(attr,LBP_WJ);
7441 PRINT_BIT(attr,LBP_ZW);
7442 PRINT_BIT(attr,LBP_GL);
7443 PRINT_BIT(attr,LBP_SP);
7444 PRINT_BIT(attr,LBP_B2);
7445 PRINT_BIT(attr,LBP_BA);
7446 PRINT_BIT(attr,LBP_BB);
7447 PRINT_BIT(attr,LBP_HY);
7448 PRINT_BIT(attr,LBP_CB);
7449 PRINT_BIT(attr,LBP_CL);
7450 PRINT_BIT(attr,LBP_CP);
7451 PRINT_BIT(attr,LBP_EX);
7452 PRINT_BIT(attr,LBP_IN);
7453 PRINT_BIT(attr,LBP_NS);
7454 PRINT_BIT(attr,LBP_OP);
7455 PRINT_BIT(attr,LBP_QU);
7456 PRINT_BIT(attr,LBP_IS);
7457 PRINT_BIT(attr,LBP_NU);
7458 PRINT_BIT(attr,LBP_PO);
7459 PRINT_BIT(attr,LBP_PR);
7460 PRINT_BIT(attr,LBP_SY);
7461 PRINT_BIT(attr,LBP_AI);
7462 PRINT_BIT(attr,LBP_AL);
7463 PRINT_BIT(attr,LBP_H2);
7464 PRINT_BIT(attr,LBP_H3);
7465 PRINT_BIT(attr,LBP_HL);
7466 PRINT_BIT(attr,LBP_ID);
7467 PRINT_BIT(attr,LBP_JL);
7468 PRINT_BIT(attr,LBP_JV);
7469 PRINT_BIT(attr,LBP_JT);
7470 PRINT_BIT(attr,LBP_RI);
7471 PRINT_BIT(attr,LBP_SA);
7472 PRINT_BIT(attr,LBP_XX);
/* Terminate the line for this code point. */
7474 fprintf (stream, "\n");
/* Writes the human readable dump produced by debug_output_org_lbp (the
   LineBreak.txt values held in unicode_org_lbp[]) to the file named
   FILENAME, opened with mode "w".  Diagnostics go to stderr: one message
   for a file that cannot be opened (presumably when fopen returns NULL; the
   test itself is not visible in this excerpt) and one when the stream
   reports an error or fclose fails. */
7480 debug_output_org_lbrk_tables (const char *filename)
7484 stream = fopen (filename, "w");
7487 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7491 debug_output_org_lbp (stream);
/* fclose is only attempted when no stream error has been recorded. */
7493 if (ferror (stream) || fclose (stream))
7495 fprintf (stderr, "error writing to '%s'\n", filename);
7500 /* Construction of sparse 3-level tables.
   These macros configure a table type named lbp_table whose elements are
   unsigned char and whose default element is LBP_XX; output_lbp uses it
   through lbp_table_init, lbp_table_add and lbp_table_finalize.  The code
   that consumes the macros is presumably a generic implementation included
   right after them - it is not visible in this excerpt.  xmalloc and
   xrealloc are mapped to plain malloc and realloc, i.e. without any added
   out-of-memory handling. */
7501 #define TABLE lbp_table
7502 #define ELEMENT unsigned char
7503 #define DEFAULT LBP_XX
7504 #define xmalloc malloc
7505 #define xrealloc realloc
/* Builds the 3-level table of line breaking properties for all code points
   and emits it as C source text.
   STREAM1 receives the declarations: the lbrkprop_header_0..4 macros, the
   lbrkprop_t struct type and the extern declaration of unilbrkprop.
   STREAM2 receives the definition of unilbrkprop with the contents of the
   three levels.  */
7509 output_lbp (FILE *stream1, FILE *stream2)
7513 unsigned int level1_offset, level2_offset, level3_offset;
7517 lbp_table_init (&t);
7519 for (i = 0; i < 0x110000; i++)
7521 int64_t attr = get_lbp (i);
7523 /* Now attr should contain exactly one bit. */
7524 assert (attr != 0 && (attr & (attr - 1)) == 0);
/* Only code points whose property differs from LBP_XX are added. */
7526 if (attr != (int64_t) 1 << LBP_XX)
7528 unsigned int log2_attr;
/* Convert the single-bit mask into its bit index; the loop body is empty. */
7529 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
7531 lbp_table_add (&t, i, log2_attr);
7535 lbp_table_finalize (&t);
/* The three expressions below are byte offsets into t.result (their
   assignment targets are not visible in this excerpt, presumably
   level1_offset, level2_offset and level3_offset): a header of 5 uint32_t
   words, followed by t.level1_size words, followed by
   (t.level2_size << t.q) words. */
7538 5 * sizeof (uint32_t);
7540 5 * sizeof (uint32_t)
7541 + t.level1_size * sizeof (uint32_t);
7543 5 * sizeof (uint32_t)
7544 + t.level1_size * sizeof (uint32_t)
7545 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.
   NOTE(review): the values are uint32_t but are printed with %d - confirm
   that they always fit in int. */
7547 for (i = 0; i < 5; i++)
7548 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
7549 ((uint32_t *) t.result)[i]);
7550 fprintf (stream1, "\n");
/* Emit the struct type whose array dimensions match the table sizes. */
7551 fprintf (stream1, "typedef struct\n");
7552 fprintf (stream1, " {\n");
7553 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7554 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7555 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
7556 fprintf (stream1, " }\n");
7557 fprintf (stream1, "lbrkprop_t;\n");
7558 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
/* Level 1 initializer, eight entries per output line. */
7560 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
7561 fprintf (stream2, "{\n");
7562 fprintf (stream2, " {");
7563 if (t.level1_size > 8)
7564 fprintf (stream2, "\n ");
7565 for (i = 0; i < t.level1_size; i++)
7568 if (i > 0 && (i % 8) == 0)
7569 fprintf (stream2, "\n ");
7570 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Depending on a test that is not visible in this excerpt, either -1 or the
   entry's offset relative to the start of level 2, counted in uint32_t
   units, is printed. */
7572 fprintf (stream2, " %5d", -1);
7574 fprintf (stream2, " %5zu",
7575 (offset - level2_offset) / sizeof (uint32_t));
7576 if (i+1 < t.level1_size)
7577 fprintf (stream2, ",");
7579 if (t.level1_size > 8)
7580 fprintf (stream2, "\n ");
7581 fprintf (stream2, " },\n");
/* Level 2 initializer, eight entries per output line. */
7582 fprintf (stream2, " {");
7583 if (t.level2_size << t.q > 8)
7584 fprintf (stream2, "\n ");
7585 for (i = 0; i < t.level2_size << t.q; i++)
7588 if (i > 0 && (i % 8) == 0)
7589 fprintf (stream2, "\n ");
7590 offset = ((uint32_t *) (t.result + level2_offset))[i];
/* As for level 1: either -1 or the offset relative to the start of level 3,
   counted in unsigned char units. */
7592 fprintf (stream2, " %5d", -1);
7594 fprintf (stream2, " %5zu",
7595 (offset - level3_offset) / sizeof (unsigned char));
7596 if (i+1 < t.level2_size << t.q)
7597 fprintf (stream2, ",");
7599 if (t.level2_size << t.q > 8)
7600 fprintf (stream2, "\n ");
7601 fprintf (stream2, " },\n");
/* Level 3 initializer: the property values, printed symbolically. */
7602 fprintf (stream2, " {");
7603 if (t.level3_size << t.p > 8)
7604 fprintf (stream2, "\n ");
7605 for (i = 0; i < t.level3_size << t.p; i++)
7607 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
7608 const char *value_string;
/* CASE maps a property value to the spelling of its constant name. */
7611 #define CASE(x) case x: value_string = #x; break;
7651 if (i > 0 && (i % 8) == 0)
7652 fprintf (stream2, "\n ");
7653 fprintf (stream2, " %s%s", value_string,
7654 (i+1 < t.level3_size << t.p ? "," : ""));
7656 if (t.level3_size << t.p > 8)
7657 fprintf (stream2, "\n ");
7658 fprintf (stream2, " }\n");
7659 fprintf (stream2, "};\n");
/* Creates the two generated source files for the line breaking property
   table.  FILENAME1 and FILENAME2 are opened for writing; both receive the
   same "DO NOT EDIT" banner, a line naming the Unicode version (the format
   has one %s, presumably filled from VERSION - the argument line is not
   visible in this excerpt) and a GPL notice.  output_lbp then writes the
   declarations to the first file and the table data to the second.  Open
   and write failures are reported on stderr. */
7663 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
7665 const char *filenames[2];
7669 filenames[0] = filename1;
7670 filenames[1] = filename2;
/* Open both output files before anything is written. */
7672 for (i = 0; i < 2; i++)
7674 streams[i] = fopen (filenames[i], "w");
7675 if (streams[i] == NULL)
7677 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Write the identical preamble to each file. */
7682 for (i = 0; i < 2; i++)
7684 FILE *stream = streams[i];
7686 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7687 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
7688 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7690 fprintf (stream, "\n");
7692 /* Put a GPL header on it. The gnulib module is under LGPL (although it
   still carries the GPL header), and it's gnulib-tool which replaces the
   GPL header with an LGPL header. */
7695 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
7696 fprintf (stream, "\n");
7697 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7698 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7699 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7700 fprintf (stream, " (at your option) any later version.\n");
7701 fprintf (stream, "\n");
7702 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7703 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7704 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7705 fprintf (stream, " GNU General Public License for more details.\n");
7706 fprintf (stream, "\n");
7707 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7708 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7709 fprintf (stream, "\n");
/* Declarations go to the first file, table data to the second. */
7712 output_lbp (streams[0], streams[1]);
/* Check each stream for errors and close it. */
7714 for (i = 0; i < 2; i++)
7716 if (ferror (streams[i]) || fclose (streams[i]))
7718 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
7724 /* ========================================================================= */
7726 /* Word break property.
7727 Updated for Unicode TR #29 revision 17. */
7729 /* Possible values of the Word_Break property.
   The WBP_* constants serve two purposes in this file: get_wbp uses them as
   bit positions (1 << WBP_xxx) when building its mask, and
   unicode_org_wbp[] stores them directly as values.  Only one enumerator of
   the list is visible in this excerpt. */
7744 WBP_EXTENDNUMLET = 7,
7751 /* Returns the word breaking property for ch, as a bit mask.
   Bit WBP_xxx of the result is set when CH satisfies the rule for that
   Word_Break value.  The rules consult unicode_attributes[] (name and
   general category), unicode_properties[], unicode_scripts[] and, for some
   values, the line breaking property from get_lbp.  Several conditions of
   the original function are not visible in this excerpt, so the comments
   below describe only the visible parts. */
7753 get_wbp (unsigned int ch)
/* Tests whether CH has a name entry; presumably this guards the
   classification below. */
7757 if (unicode_attributes[ch].name != NULL)
7760 attr |= 1 << WBP_CR;
7763 attr |= 1 << WBP_LF;
/* Newline: U+000B, U+000C, U+2028, U+2029 (plus a condition line that is
   not visible here). */
7765 if (ch == 0x000B || ch == 0x000C
7767 || ch == 0x2028 || ch == 0x2029)
7768 attr |= 1 << WBP_NEWLINE;
/* Extend: the PROP_GRAPHEME_EXTEND property bit is set, or the general
   category is "Mc". */
7770 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
7771 || (unicode_attributes[ch].category != NULL
7772 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
7773 attr |= 1 << WBP_EXTEND;
/* Format: general category "Cf", except U+200B, U+200C and U+200D. */
7775 if (unicode_attributes[ch].category != NULL
7776 && strcmp (unicode_attributes[ch].category, "Cf") == 0
7777 && ch != 0x200B && ch != 0x200C && ch != 0x200D)
7778 attr |= 1 << WBP_FORMAT;
/* Katakana: script "Katakana", or one of the explicitly listed code
   points. */
7780 if ((unicode_scripts[ch] < numscripts
7781 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
7782 || (ch >= 0x3031 && ch <= 0x3035)
7783 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
7785 attr |= 1 << WBP_KATAKANA;
/* Hebrew_Letter: script "Hebrew" with general category "Lo". */
7787 if ((unicode_scripts[ch] < numscripts
7788 && strcmp (scripts[unicode_scripts[ch]], "Hebrew") == 0)
7789 && strcmp (unicode_attributes[ch].category, "Lo") == 0)
7790 attr |= 1 << WBP_HL;
/* ALetter: PROP_ALPHABETIC is set (possibly with an alternative that is not
   visible here), and the character is not ideographic, not Katakana, not
   LBP_SA for line breaking, not in script "Hiragana", not Extend and not
   Hebrew_Letter. */
7792 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
7794 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
7795 && (attr & (1 << WBP_KATAKANA)) == 0
7796 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
7797 && !(unicode_scripts[ch] < numscripts
7798 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
7799 && (attr & (1 << WBP_EXTEND)) == 0
7800 && (attr & (1 << WBP_HL)) == 0)
7801 attr |= 1 << WBP_ALETTER;
/* MidNumLet and MidLetter are decided by dedicated predicates. */
7803 if (is_WBP_MIDNUMLET (ch))
7804 attr |= 1 << WBP_MIDNUMLET;
7806 if (is_WBP_MIDLETTER (ch))
7807 attr |= 1 << WBP_MIDLETTER;
/* MidNum: line breaking property LBP_IS or one of the listed code points,
   excluding U+003A, U+FE13 and U+002E. */
7809 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
7810 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
7812 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
7813 attr |= 1 << WBP_MIDNUM;
/* Numeric: line breaking property LBP_NU (the rest of the condition is not
   visible here). */
7815 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
7817 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category "Pc". */
7819 if (unicode_attributes[ch].category != NULL
7820 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
7821 attr |= 1 << WBP_EXTENDNUMLET;
/* Regional_Indicator: taken over from the line breaking property LBP_RI. */
7823 if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
7824 attr |= 1 << WBP_RI;
7827 attr |= 1 << WBP_DQ;
7830 attr |= 1 << WBP_SQ;
/* Presumably the fallback when no other bit was set - the guarding test is
   not visible in this excerpt. */
7835 attr |= 1 << WBP_OTHER;
7840 /* Output the word break property in a human readable format.
   For every code point from 0 to 0x10FFFF the bit mask from get_wbp is
   examined.  Code points whose mask is exactly the WBP_OTHER bit are
   omitted; for every other code point one line is written to STREAM: the
   code point as "0x%04X" followed by the Word_Break value name for each bit
   that is set. */
7842 debug_output_wbp (FILE *stream)
7846 for (i = 0; i < 0x110000; i++)
7848 int attr = get_wbp (i);
/* Skip code points that carry only the WBP_OTHER bit. */
7849 if (attr != 1 << WBP_OTHER)
7851 fprintf (stream, "0x%04X", i);
/* One test per property bit; the printed names use the spelling that
   fill_org_wbp matches against WordBreakProperty.txt. */
7852 if (attr & (1 << WBP_CR))
7853 fprintf (stream, " CR");
7854 if (attr & (1 << WBP_LF))
7855 fprintf (stream, " LF");
7856 if (attr & (1 << WBP_NEWLINE))
7857 fprintf (stream, " Newline");
7858 if (attr & (1 << WBP_EXTEND))
7859 fprintf (stream, " Extend");
7860 if (attr & (1 << WBP_FORMAT))
7861 fprintf (stream, " Format");
7862 if (attr & (1 << WBP_KATAKANA))
7863 fprintf (stream, " Katakana");
7864 if (attr & (1 << WBP_ALETTER))
7865 fprintf (stream, " ALetter");
7866 if (attr & (1 << WBP_MIDNUMLET))
7867 fprintf (stream, " MidNumLet");
7868 if (attr & (1 << WBP_MIDLETTER))
7869 fprintf (stream, " MidLetter");
7870 if (attr & (1 << WBP_MIDNUM))
7871 fprintf (stream, " MidNum");
7872 if (attr & (1 << WBP_NUMERIC))
7873 fprintf (stream, " Numeric");
7874 if (attr & (1 << WBP_EXTENDNUMLET))
7875 fprintf (stream, " ExtendNumLet");
7876 if (attr & (1 << WBP_RI))
7877 fprintf (stream, " Regional_Indicator");
7878 if (attr & (1 << WBP_DQ))
7879 fprintf (stream, " Double_Quote");
7880 if (attr & (1 << WBP_SQ))
7881 fprintf (stream, " Single_Quote");
7882 if (attr & (1 << WBP_HL))
7883 fprintf (stream, " Hebrew_Letter");
/* Terminate the line for this code point. */
7884 fprintf (stream, "\n");
/* Writes the human readable dump produced by debug_output_wbp to the file
   named FILENAME, opened with mode "w".  Diagnostics go to stderr: one
   message for a file that cannot be opened (presumably when fopen returns
   NULL; the test itself is not visible in this excerpt) and one when the
   stream reports an error or fclose fails. */
7890 debug_output_wbrk_tables (const char *filename)
7894 stream = fopen (filename, "w");
7897 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7901 debug_output_wbp (stream);
/* fclose is only attempted when no stream error has been recorded. */
7903 if (ferror (stream) || fclose (stream))
7905 fprintf (stderr, "error writing to '%s'\n", filename);
7910 /* The word break property from the WordBreakProperty.txt file.
   Indexed by code point (0 .. 0x10FFFF).  Each element holds a single WBP_*
   value (not a bit mask); fill_org_wbp presets every element to WBP_OTHER
   before storing the values it parses. */
7911 int unicode_org_wbp[0x110000];
7913 /* Stores in unicode_org_wbp[] the word break property from the
   WordBreakProperty.txt file.
   WORDBREAKPROPERTY_FILENAME is the path of that file.  Every element is
   first set to WBP_OTHER.  The file is then read line by line; empty lines
   and lines starting with '#' are recognized specially (presumably
   skipped).  A data line has the form "XXXX..YYYY ; Name" or
   "XXXX ; Name" with hexadecimal code points; the name is translated to a
   WBP_* value which is stored for every code point of the range.
   Diagnostics are written to stderr. */
7916 fill_org_wbp (const char *wordbreakproperty_filename)
/* Default for code points that the file does not mention. */
7921 for (i = 0; i < 0x110000; i++)
7922 unicode_org_wbp[i] = WBP_OTHER;
7924 stream = fopen (wordbreakproperty_filename, "r");
7927 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
7934 unsigned int i1, i2;
7935 char padding[200+1];
7936 char propname[200+1];
/* Read at most 200 characters of one line, then consume the whitespace that
   follows (the trailing "\n" directive matches any run of whitespace).
   NOTE(review): the %[ ;] and %[^ ] conversions used on buf below carry no
   field width; they stay within padding and propname only because buf is
   limited to 200 characters here - the declaration of buf is not visible in
   this excerpt. */
7939 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7942 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form. */
7945 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
7947 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
7949 fprintf (stderr, "parse error in '%s'\n",
7950 wordbreakproperty_filename);
/* Each PROP expands to "if (...) propvalue = ...; else", so consecutive uses
   form one if / else-if chain whose final else governs the statement that
   follows the last PROP. */
7955 #define PROP(name,value) \
7956 if (strcmp (propname, name) == 0) propvalue = value; else
7959 PROP ("Newline", WBP_NEWLINE)
7960 PROP ("Extend", WBP_EXTEND)
7961 PROP ("Format", WBP_FORMAT)
7962 PROP ("Katakana", WBP_KATAKANA)
7963 PROP ("ALetter", WBP_ALETTER)
7964 PROP ("MidNumLet", WBP_MIDNUMLET)
7965 PROP ("MidLetter", WBP_MIDLETTER)
7966 PROP ("MidNum", WBP_MIDNUM)
7967 PROP ("Numeric", WBP_NUMERIC)
7968 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
7969 PROP ("Regional_Indicator", WBP_RI)
7970 PROP ("Double_Quote", WBP_DQ)
7971 PROP ("Single_Quote", WBP_SQ)
7972 PROP ("Hebrew_Letter", WBP_HL)
7975 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
7976 wordbreakproperty_filename);
/* Guard the array bounds before storing. */
7979 assert (i1 <= i2 && i2 < 0x110000);
7981 for (i = i1; i <= i2; i++)
7982 unicode_org_wbp[i] = propvalue;
/* fclose is only attempted when no stream error has been recorded. */
7985 if (ferror (stream) || fclose (stream))
7987 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
7992 /* Output the word break property in a human readable format.
   This variant dumps the values stored in unicode_org_wbp[] (the ones taken
   from WordBreakProperty.txt).  Code points whose value is WBP_OTHER are
   omitted; for every other code point one line is written to STREAM: the
   code point as "0x%04X" followed by the name of its Word_Break value, or
   " ??" when the value matches none of the listed names. */
7994 debug_output_org_wbp (FILE *stream)
7998 for (i = 0; i < 0x110000; i++)
/* propvalue is a single WBP_* value, not a bit mask. */
8000 int propvalue = unicode_org_wbp[i];
8001 if (propvalue != WBP_OTHER)
8003 fprintf (stream, "0x%04X", i);
/* Each PROP expands to "if (...) fprintf (...); else", so consecutive uses
   form one if / else-if chain; its final else governs the " ??" fallback
   that follows the last PROP. */
8004 #define PROP(name,value) \
8005 if (propvalue == value) fprintf (stream, " " name); else
8008 PROP ("Newline", WBP_NEWLINE)
8009 PROP ("Extend", WBP_EXTEND)
8010 PROP ("Format", WBP_FORMAT)
8011 PROP ("Katakana", WBP_KATAKANA)
8012 PROP ("ALetter", WBP_ALETTER)
8013 PROP ("MidNumLet", WBP_MIDNUMLET)
8014 PROP ("MidLetter", WBP_MIDLETTER)
8015 PROP ("MidNum", WBP_MIDNUM)
8016 PROP ("Numeric", WBP_NUMERIC)
8017 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
8018 PROP ("Regional_Indicator", WBP_RI)
8019 PROP ("Double_Quote", WBP_DQ)
8020 PROP ("Single_Quote", WBP_SQ)
8021 PROP ("Hebrew_Letter", WBP_HL)
8023 fprintf (stream, " ??");
/* Terminate the line for this code point. */
8024 fprintf (stream, "\n");
/* Writes the human readable dump produced by debug_output_org_wbp (the
   WordBreakProperty.txt values held in unicode_org_wbp[]) to the file named
   FILENAME, opened with mode "w".  Diagnostics go to stderr: one message
   for a file that cannot be opened (presumably when fopen returns NULL; the
   test itself is not visible in this excerpt) and one when the stream
   reports an error or fclose fails. */
8030 debug_output_org_wbrk_tables (const char *filename)
8034 stream = fopen (filename, "w");
8037 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8041 debug_output_org_wbp (stream);
/* fclose is only attempted when no stream error has been recorded. */
8043 if (ferror (stream) || fclose (stream))
8045 fprintf (stderr, "error writing to '%s'\n", filename);
8050 /* Construction of sparse 3-level tables.
   These macros configure a table type named wbp_table whose elements are
   unsigned char and whose default element is WBP_OTHER; output_wbp uses it
   through wbp_table_init, wbp_table_add and wbp_table_finalize.  The code
   that consumes the macros is presumably a generic implementation included
   right after them - it is not visible in this excerpt.  xmalloc and
   xrealloc are mapped to plain malloc and realloc, i.e. without any added
   out-of-memory handling. */
8051 #define TABLE wbp_table
8052 #define ELEMENT unsigned char
8053 #define DEFAULT WBP_OTHER
8054 #define xmalloc malloc
8055 #define xrealloc realloc
/* Write to STREAM the C source of the word break property lookup table:
   five wbrkprop_header_N defines, the wbrkprop_t struct type and the
   initializer of the static table uniwbrkprop.  */
8059 output_wbp (FILE *stream)
8063 unsigned int level1_offset, level2_offset, level3_offset;
8067 wbp_table_init (&t);
/* Add every code point whose property is not WBP_OTHER to the table.  */
8069 for (i = 0; i < 0x110000; i++)
8071 int attr = get_wbp (i);
8073 /* Now attr should contain exactly one bit. */
8074 assert (attr != 0 && (attr & (attr - 1)) == 0);
8076 if (attr != 1 << WBP_OTHER)
8078 unsigned int log2_attr;
/* Turn the single-bit mask into the index of that bit; the index is what
   gets stored in the table.  */
8079 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
8081 wbp_table_add (&t, i, log2_attr);
8085 wbp_table_finalize (&t);
/* Byte offsets into t.result: five uint32_t header words come first, then
   the level 1 entries, then the level 2 entries, then level 3.  */
8088 5 * sizeof (uint32_t);
8090 5 * sizeof (uint32_t)
8091 + t.level1_size * sizeof (uint32_t);
8093 5 * sizeof (uint32_t)
8094 + t.level1_size * sizeof (uint32_t)
8095 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words are emitted as preprocessor constants.  */
8097 for (i = 0; i < 5; i++)
8098 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
8099 ((uint32_t *) t.result)[i]);
8100 fprintf (stream, "\n");
8101 fprintf (stream, "typedef struct\n");
8102 fprintf (stream, " {\n");
8103 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8104 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
8105 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
8106 fprintf (stream, " }\n");
8107 fprintf (stream, "wbrkprop_t;\n");
8108 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
8109 fprintf (stream, "{\n");
/* Level 1, eight entries per output line.  An entry is printed either as -1
   or as its offset relative to level2_offset, counted in uint32_t units.  */
8110 fprintf (stream, " {");
8111 if (t.level1_size > 8)
8112 fprintf (stream, "\n ");
8113 for (i = 0; i < t.level1_size; i++)
8116 if (i > 0 && (i % 8) == 0)
8117 fprintf (stream, "\n ");
8118 offset = ((uint32_t *) (t.result + level1_offset))[i];
8120 fprintf (stream, " %5d", -1);
8122 fprintf (stream, " %5zu",
8123 (offset - level2_offset) / sizeof (uint32_t));
8124 if (i+1 < t.level1_size)
8125 fprintf (stream, ",");
8127 if (t.level1_size > 8)
8128 fprintf (stream, "\n ");
8129 fprintf (stream, " },\n");
/* Level 2, eight entries per output line.  An entry is printed either as -1
   or as its offset relative to level3_offset, counted in elements.  */
8130 fprintf (stream, " {");
8131 if (t.level2_size << t.q > 8)
8132 fprintf (stream, "\n ");
8133 for (i = 0; i < t.level2_size << t.q; i++)
8136 if (i > 0 && (i % 8) == 0)
8137 fprintf (stream, "\n ");
8138 offset = ((uint32_t *) (t.result + level2_offset))[i];
8140 fprintf (stream, " %5d", -1);
8142 fprintf (stream, " %5zu",
8143 (offset - level3_offset) / sizeof (unsigned char));
8144 if (i+1 < t.level2_size << t.q)
8145 fprintf (stream, ",");
8147 if (t.level2_size << t.q > 8)
8148 fprintf (stream, "\n ");
8149 fprintf (stream, " },\n");
/* Level 3, four entries per output line.  Each element is written as the
   name of its WBP_* constant (the CASE macro stringifies the constant).  */
8150 fprintf (stream, " {");
8151 if (t.level3_size << t.p > 4)
8152 fprintf (stream, "\n ");
8153 for (i = 0; i < t.level3_size << t.p; i++)
8155 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
8156 const char *value_string;
8159 #define CASE(x) case x: value_string = #x; break;
8168 CASE(WBP_MIDNUMLET);
8169 CASE(WBP_MIDLETTER);
8172 CASE(WBP_EXTENDNUMLET);
8181 if (i > 0 && (i % 4) == 0)
8182 fprintf (stream, "\n ");
8183 fprintf (stream, " %s%s", value_string,
8184 (i+1 < t.level3_size << t.p ? "," : ""));
8186 if (t.level3_size << t.p > 4)
8187 fprintf (stream, "\n ");
8188 fprintf (stream, " }\n");
8189 fprintf (stream, "};\n");
/* Output the word break property table for Unicode VERSION to FILENAME.
   The file starts with a "generated automatically" banner and a GPL notice,
   followed by the table written by output_wbp.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_wbrk_tables (const char *filename, const char *version)
{
  FILE *stream;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  /* This file carries the word break table (see output_wbp), so the banner
     names word breaking rather than line breaking.  */
  fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  /* Put a GPL header on it. The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header. */
  fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");

  output_wbp (stream);

  /* fclose is part of the check so that a failure while flushing the last
     buffered data is reported too.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8238 /* ========================================================================= */
8240 /* Grapheme break property.
8241 Updated for Unicode TR #29 revision 17. */
8243 /* Possible values of the Grapheme_Cluster_Break property. */
8252 GBP_SPACINGMARK = 6,
8261 /* Construction of sparse 3-level tables. */
/* Parameters for the grapheme break property table: identifiers carry the
   gbp_table prefix (gbp_table_init, gbp_table_add and gbp_table_finalize are
   called by output_gbp_table), elements are unsigned char, and GBP_OTHER is
   the default element.  xmalloc and xrealloc are mapped to plain malloc and
   realloc.
   NOTE(review): the code that consumes these macros is not visible in this
   part of the file.  */
8262 #define TABLE gbp_table
8263 #define ELEMENT unsigned char
8264 #define DEFAULT GBP_OTHER
8265 #define xmalloc malloc
8266 #define xrealloc realloc
8269 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
/* Indexed by code point.  fill_org_gbp first sets every entry to GBP_OTHER
   and then stores the values read from the file.  */
8270 int unicode_org_gbp[0x110000];
8272 /* Output the unit test data for the grapheme break property. */
/* Writes to FILENAME a banner and a GPL notice, followed by entries of the
   form "{ 0xXXXX, NAME }" built from unicode_org_gbp[], where NAME is the
   stringified GBP_* constant.  Open and write failures are reported on
   stderr.  */
8274 output_gbp_test (const char *filename)
8280 stream = fopen (filename, "w");
8283 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8287 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8288 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
8289 fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
8290 fprintf (stream, "\n");
8291 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
8292 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
8293 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
8294 fprintf (stream, " (at your option) any later version.\n");
8295 fprintf (stream, "\n");
8296 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
8297 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
8298 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
8299 fprintf (stream, " GNU General Public License for more details.\n");
8300 fprintf (stream, "\n");
8301 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
8302 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
8303 fprintf (stream, "\n");
8306 for (ch = 0; ch < 0x110000; ch++)
8308 int gbp = unicode_org_gbp[ch];
8309 const char *gbp_string;
/* Look ahead over the following code points that have the same value.
   NOTE(review): the loop body is not visible here; presumably it advances
   ch to the last code point of the run.  */
8311 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
8316 #define CASE(x) case x: gbp_string = #x; break;
8323 CASE (GBP_SPACINGMARK)
8336 fprintf (stream, ",\n");
/* The entry records ch + 1 together with the property name.  */
8337 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
8341 fprintf (stream, "\n");
8343 if (ferror (stream) || fclose (stream))
8345 fprintf (stderr, "error writing to '%s'\n", filename);
8350 /* Output the per-character grapheme break property table. */
/* Writes to FILENAME a banner naming Unicode VERSION, five
   gbrkprop_header_N defines and the initializer of the static table
   unigbrkprop, which is built from unicode_org_gbp[].  Open and write
   failures are reported on stderr.  */
8352 output_gbp_table (const char *filename, const char *version)
8357 unsigned int level1_offset, level2_offset, level3_offset;
8359 stream = fopen (filename, "w");
8362 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8366 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8367 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
8368 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8373 gbp_table_init (&t);
/* Every code point is added, including those whose value is GBP_OTHER.  */
8375 for (ch = 0; ch < 0x110000; ch++)
8376 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
8378 gbp_table_finalize (&t);
8380 /* Offsets in t.result, in memory of this process. */
/* Five uint32_t header words come first, then the level 1 entries, then the
   level 2 entries, then level 3.  */
8382 5 * sizeof (uint32_t);
8384 5 * sizeof (uint32_t)
8385 + t.level1_size * sizeof (uint32_t);
8387 5 * sizeof (uint32_t)
8388 + t.level1_size * sizeof (uint32_t)
8389 + (t.level2_size << t.q) * sizeof (uint32_t);
8391 for (i = 0; i < 5; i++)
8392 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
8393 ((uint32_t *) t.result)[i]);
8394 fprintf (stream, "static const\n");
8395 fprintf (stream, "struct\n");
8396 fprintf (stream, " {\n");
8397 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8398 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 is declared with half as many bytes as there are elements, because
   two elements are packed into each byte below.  */
8399 fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
8400 t.level3_size, t.p);
8401 fprintf (stream, " }\n");
8402 fprintf (stream, "unigbrkprop =\n");
8403 fprintf (stream, "{\n");
/* Level 1, eight entries per output line.  An entry is printed either as -1
   or as its offset relative to level2_offset, counted in uint32_t units.  */
8404 fprintf (stream, " {");
8405 if (t.level1_size > 8)
8406 fprintf (stream, "\n ");
8407 for (i = 0; i < t.level1_size; i++)
8410 if (i > 0 && (i % 8) == 0)
8411 fprintf (stream, "\n ");
8412 offset = ((uint32_t *) (t.result + level1_offset))[i];
8414 fprintf (stream, " %5d", -1);
8416 fprintf (stream, " %5zu",
8417 (offset - level2_offset) / sizeof (uint32_t));
8418 if (i+1 < t.level1_size)
8419 fprintf (stream, ",");
8421 if (t.level1_size > 8)
8422 fprintf (stream, "\n ");
8423 fprintf (stream, " },\n");
/* Level 2, eight entries per output line.  The offset relative to
   level3_offset is halved to match the packed level3 array.  */
8424 fprintf (stream, " {");
8425 if (t.level2_size << t.q > 8)
8426 fprintf (stream, "\n ");
8427 for (i = 0; i < t.level2_size << t.q; i++)
8430 if (i > 0 && (i % 8) == 0)
8431 fprintf (stream, "\n ");
8432 offset = ((uint32_t *) (t.result + level2_offset))[i];
8434 fprintf (stream, " %5d", -1);
8436 fprintf (stream, " %5zu",
8437 (offset - level3_offset) / sizeof (uint8_t) / 2);
8438 if (i+1 < t.level2_size << t.q)
8439 fprintf (stream, ",");
8441 if (t.level2_size << t.q > 8)
8442 fprintf (stream, "\n ");
8443 fprintf (stream, " },\n");
/* Level 3: each output byte combines two consecutive elements, the
   even-indexed one in the low four bits and the odd-indexed one in the
   high four bits.  */
8444 fprintf (stream, " {");
8445 if (t.level3_size << t.p > 8)
8446 fprintf (stream, "\n ");
8447 for (i = 0; i < (t.level3_size << t.p) / 2; i++)
8449 unsigned char *p = (unsigned char *) (t.result + level3_offset);
8450 unsigned char value0 = p[i * 2];
8451 unsigned char value1 = p[i * 2 + 1];
8452 if (i > 0 && (i % 8) == 0)
8453 fprintf (stream, "\n ");
8454 fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
8455 (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
8457 if (t.level3_size << t.p > 8)
8458 fprintf (stream, "\n ");
8459 fprintf (stream, " }\n");
8460 fprintf (stream, "};\n");
8462 if (ferror (stream) || fclose (stream))
8464 fprintf (stderr, "error writing to '%s'\n", filename);
8469 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
8470 GraphemeBreakProperty.txt file. */
/* Every entry is first reset to GBP_OTHER.  The file is then read line by
   line; empty lines and lines starting with '#' are recognized separately
   from data lines.  Open, parse and read failures are reported on stderr.  */
8472 fill_org_gbp (const char *graphemebreakproperty_filename)
8478 for (i = 0; i < 0x110000; i++)
8479 unicode_org_gbp[i] = GBP_OTHER;
8481 stream = fopen (graphemebreakproperty_filename, "r");
8484 fprintf (stderr, "error during fopen of '%s'\n",
8485 graphemebreakproperty_filename);
8492 unsigned int i1, i2;
8493 char padding[200+1];
8494 char propname[200+1];
/* Read one line of at most 200 characters, without its newline.  */
8498 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8501 if (buf[0] == '\0' || buf[0] == '#')
/* Two line shapes are tried: a range "XXXX..YYYY", and a single code point
   "XXXX".  Either is followed by a run of blanks and semicolons and then the
   property name, which extends up to the next blank.  */
8504 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
8506 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
8508 fprintf (stderr, "parse error in '%s'\n",
8509 graphemebreakproperty_filename);
8514 #define PROP(name,value) \
8515 if (strcmp (propname, name) == 0) propvalue = value; else
8518 PROP ("Control", GBP_CONTROL)
8519 PROP ("Extend", GBP_EXTEND)
8520 PROP ("Prepend", GBP_PREPEND)
8521 PROP ("SpacingMark", GBP_SPACINGMARK)
8526 PROP ("LVT", GBP_LVT)
8527 PROP ("Regional_Indicator", GBP_RI)
8530 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
8531 graphemebreakproperty_filename, lineno);
/* The range must be ordered and lie within the Unicode code space before it
   is used to index unicode_org_gbp[].  */
8534 assert (i1 <= i2 && i2 < 0x110000);
8536 for (i = i1; i <= i2; i++)
8537 unicode_org_gbp[i] = propvalue;
8540 if (ferror (stream) || fclose (stream))
8542 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
8547 /* ========================================================================= */
8549 /* Composition and decomposition.
8550 Updated for Unicode TR #15 revision 33. */
/* MAX_DECOMP_LENGTH bounds the arrays that get_decomposition fills;
   get_decomposition asserts that the whole decomposition field was consumed
   within this many elements.  The UC_DECOMP_* constants are the
   decomposition types that get_decomposition returns.  */
8552 /* Maximum number of characters into which a single Unicode character can be
8554 #define MAX_DECOMP_LENGTH 18
8558 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
8559 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
8560 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
8561 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
8562 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
8563 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
8564 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
8565 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
8566 UC_DECOMP_SUPER, /* <super> A superscript form. */
8567 UC_DECOMP_SUB, /* <sub> A subscript form. */
8568 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
8569 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
8570 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
8571 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
8572 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
8573 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
8574 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
8577 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
8578 decompositions). Return the type, or -1 for none. */
8580 get_decomposition (unsigned int ch,
8581 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
8583 const char *decomposition = unicode_attributes[ch].decomposition;
8585 if (decomposition != NULL && decomposition[0] != '\0')
8587 int type = UC_DECOMP_CANONICAL;
8588 unsigned int length;
8591 if (decomposition[0] == '<')
8596 rangle = strchr (decomposition + 1, '>');
8597 assert (rangle != NULL);
8598 typelen = rangle + 1 - decomposition;
8599 #define TYPE(t1,t2) \
8600 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
8603 TYPE ("<font>", UC_DECOMP_FONT)
8604 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
8605 TYPE ("<initial>", UC_DECOMP_INITIAL)
8606 TYPE ("<medial>", UC_DECOMP_MEDIAL)
8607 TYPE ("<final>", UC_DECOMP_FINAL)
8608 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
8609 TYPE ("<circle>", UC_DECOMP_CIRCLE)
8610 TYPE ("<super>", UC_DECOMP_SUPER)
8611 TYPE ("<sub>", UC_DECOMP_SUB)
8612 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
8613 TYPE ("<wide>", UC_DECOMP_WIDE)
8614 TYPE ("<narrow>", UC_DECOMP_NARROW)
8615 TYPE ("<small>", UC_DECOMP_SMALL)
8616 TYPE ("<square>", UC_DECOMP_SQUARE)
8617 TYPE ("<fraction>", UC_DECOMP_FRACTION)
8618 TYPE ("<compat>", UC_DECOMP_COMPAT)
8620 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
8624 decomposition = rangle + 1;
8625 if (decomposition[0] == ' ')
8628 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
8630 decomposed[length] = strtoul (decomposition, &endptr, 16);
8631 if (endptr == decomposition)
8633 decomposition = endptr;
8634 if (decomposition[0] == ' ')
8637 /* Make sure that *DECOMPOSITION is not NULL-terminated.
8638 Otherwise MAX_DECOMP_LENGTH is too small. */
8639 assert (*decomposition == '\0');
8648 /* Construction of sparse 3-level tables. */
/* Parameters for the decomposition index table: identifiers carry the
   decomp_table prefix (decomp_table_init, decomp_table_add and
   decomp_table_finalize are called by output_decomposition), elements are
   uint16_t, and the all-ones value (uint16_t)(-1) is the default element.
   xmalloc and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): the code that consumes these macros is not visible in this
   part of the file.  */
8649 #define TABLE decomp_table
8650 #define ELEMENT uint16_t
8651 #define DEFAULT (uint16_t)(-1)
8652 #define xmalloc malloc
8653 #define xrealloc realloc
/* Write the decomposition tables.  STREAM1 receives the declarations (the
   extern declarations, the decomp_header_N defines and the
   decomp_index_table_t type); STREAM2 receives the definitions of
   gl_uninorm_decomp_chars_table and gl_uninorm_decomp_index_table.  */
8657 output_decomposition (FILE *stream1, FILE *stream2)
8659 struct decomp_table t;
8660 unsigned int level1_offset, level2_offset, level3_offset;
8661 unsigned int offset;
8667 decomp_table_init (&t);
8669 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
8670 fprintf (stream1, "\n");
8671 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
8674 for (ch = 0; ch < 0x110000; ch++)
8676 unsigned int length;
8677 unsigned int decomposed[MAX_DECOMP_LENGTH];
8678 int type = get_decomposition (ch, &length, decomposed);
/* The index table element for ch is a 16-bit value: bit 15 is 0 for a
   canonical decomposition and 1 otherwise, and the low 15 bits hold offset,
   hence the range check.  */
8682 assert (offset < (1 << 15));
8683 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
8685 /* Produce length 3-bytes entries. */
8686 /* We would need a special representation of zero-length entries. */
8687 assert (length != 0);
8688 for (i = 0; i < length; i++)
8691 fprintf (stream2, ",");
8692 if ((offset % 4) == 0)
8693 fprintf (stream2, "\n ");
/* Layout of one 24-bit entry: bit 23 is set when further elements follow,
   the decomposition type is placed at bit 18 and up in the first element
   only, and the code point occupies the low 18 bits.  */
8694 assert (decomposed[i] < (1 << 18));
8695 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
8696 (((i+1 < length ? (1 << 23) : 0)
8697 | (i == 0 ? (type << 18) : 0)
8698 | decomposed[i]) >> 16) & 0xff,
8699 (decomposed[i] >> 8) & 0xff,
8700 decomposed[i] & 0xff);
8706 fprintf (stream2, "\n};\n");
8707 fprintf (stream2, "\n");
8709 decomp_table_finalize (&t);
/* Byte offsets into t.result: five uint32_t header words come first, then
   the level 1 entries, then the level 2 entries, then level 3.  */
8712 5 * sizeof (uint32_t);
8714 5 * sizeof (uint32_t)
8715 + t.level1_size * sizeof (uint32_t);
8717 5 * sizeof (uint32_t)
8718 + t.level1_size * sizeof (uint32_t)
8719 + (t.level2_size << t.q) * sizeof (uint32_t);
8721 for (i = 0; i < 5; i++)
8722 fprintf (stream1, "#define decomp_header_%d %d\n", i,
8723 ((uint32_t *) t.result)[i]);
8724 fprintf (stream1, "\n");
8725 fprintf (stream1, "typedef struct\n");
8726 fprintf (stream1, " {\n");
8727 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
8728 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
8729 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
8730 fprintf (stream1, " }\n");
8731 fprintf (stream1, "decomp_index_table_t;\n");
8732 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
8733 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
8734 fprintf (stream2, "{\n");
/* Level 1, eight entries per output line.  An entry is printed either as -1
   or as its offset relative to level2_offset, counted in uint32_t units.  */
8735 fprintf (stream2, " {");
8736 if (t.level1_size > 8)
8737 fprintf (stream2, "\n ");
8738 for (i = 0; i < t.level1_size; i++)
8741 if (i > 0 && (i % 8) == 0)
8742 fprintf (stream2, "\n ");
8743 offset = ((uint32_t *) (t.result + level1_offset))[i];
8745 fprintf (stream2, " %5d", -1);
8747 fprintf (stream2, " %5zu",
8748 (offset - level2_offset) / sizeof (uint32_t));
8749 if (i+1 < t.level1_size)
8750 fprintf (stream2, ",");
8752 if (t.level1_size > 8)
8753 fprintf (stream2, "\n ");
8754 fprintf (stream2, " },\n");
/* Level 2, eight entries per output line.  An entry is printed either as -1
   or as its offset relative to level3_offset, counted in uint16_t units.  */
8755 fprintf (stream2, " {");
8756 if (t.level2_size << t.q > 8)
8757 fprintf (stream2, "\n ");
8758 for (i = 0; i < t.level2_size << t.q; i++)
8761 if (i > 0 && (i % 8) == 0)
8762 fprintf (stream2, "\n ");
8763 offset = ((uint32_t *) (t.result + level2_offset))[i];
8765 fprintf (stream2, " %5d", -1);
8767 fprintf (stream2, " %5zu",
8768 (offset - level3_offset) / sizeof (uint16_t));
8769 if (i+1 < t.level2_size << t.q)
8770 fprintf (stream2, ",");
8772 if (t.level2_size << t.q > 8)
8773 fprintf (stream2, "\n ");
8774 fprintf (stream2, " },\n");
/* Level 3, eight entries per output line.  The default element
   (uint16_t)(-1) is printed as -1; all other elements are printed as they
   are.  */
8775 fprintf (stream2, " {");
8776 if (t.level3_size << t.p > 8)
8777 fprintf (stream2, "\n ");
8778 for (i = 0; i < t.level3_size << t.p; i++)
8780 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
8781 if (i > 0 && (i % 8) == 0)
8782 fprintf (stream2, "\n ");
8783 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
8784 if (i+1 < t.level3_size << t.p)
8785 fprintf (stream2, ",");
8787 if (t.level3_size << t.p > 8)
8788 fprintf (stream2, "\n ");
8789 fprintf (stream2, " }\n");
8790 fprintf (stream2, "};\n");
/* Create FILENAME1 and FILENAME2, write the same "generated automatically"
   banner naming Unicode VERSION into each, and let output_decomposition fill
   them: the first file is passed as its stream1, the second as its stream2.
   Open and write failures are reported on stderr with the affected file
   name.  */
8794 output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
8796 const char *filenames[2];
8800 filenames[0] = filename1;
8801 filenames[1] = filename2;
/* Open both files before anything is written.  */
8803 for (i = 0; i < 2; i++)
8805 streams[i] = fopen (filenames[i], "w");
8806 if (streams[i] == NULL)
8808 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
8813 for (i = 0; i < 2; i++)
8815 FILE *stream = streams[i];
8817 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8818 fprintf (stream, "/* Decomposition of Unicode characters. */\n");
8819 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8821 fprintf (stream, "\n");
8824 output_decomposition (streams[0], streams[1]);
/* Check and close each file separately so the message names the right one.  */
8826 for (i = 0; i < 2; i++)
8828 if (ferror (streams[i]) || fclose (streams[i]))
8830 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
8836 /* The "excluded from composition" property from the CompositionExclusions.txt file. */
/* Indexed by code point; 1 for the code points listed in the file, 0 for all
   others.  */
8837 char unicode_composition_exclusions[0x110000];
/* Fill unicode_composition_exclusions[] from COMPOSITIONEXCLUSIONS_FILENAME.
   All entries are cleared first; each data line is then expected to start
   with a hexadecimal code point, whose entry is set to 1.  Empty lines and
   lines starting with '#' are recognized separately.  Open, parse and read
   failures are reported on stderr.  */
8840 fill_composition_exclusions (const char *compositionexclusions_filename)
8845 stream = fopen (compositionexclusions_filename, "r");
8848 fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
8852 for (i = 0; i < 0x110000; i++)
8853 unicode_composition_exclusions[i] = 0;
/* Read one line of at most 200 characters, without its newline.  */
8860 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8863 if (buf[0] == '\0' || buf[0] == '#')
8866 if (sscanf (buf, "%X", &i) != 1)
8868 fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
/* Guard the array index against code points outside the Unicode range.  */
8871 assert (i < 0x110000);
8873 unicode_composition_exclusions[i] = 1;
8876 if (ferror (stream) || fclose (stream))
8878 fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
/* Write a debugging dump of the canonical composition pairs to FILENAME.
   Each output line has three tab-separated hexadecimal code points followed
   by a combining class string.  The candidates are the canonical
   decompositions returned by get_decomposition whose first element has
   combining class "0" and whose character is not flagged in
   unicode_composition_exclusions[].  Open and write failures are reported on
   stderr.  */
8884 debug_output_composition_tables (const char *filename)
8889 stream = fopen (filename, "w");
8892 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8896 for (ch = 0; ch < 0x110000; ch++)
8898 unsigned int length;
8899 unsigned int decomposed[MAX_DECOMP_LENGTH];
8900 int type = get_decomposition (ch, &length, decomposed);
8902 if (type == UC_DECOMP_CANONICAL
8903 /* Consider only binary decompositions.
8904 Exclude singleton decompositions. */
8907 unsigned int code1 = decomposed[0];
8908 unsigned int code2 = decomposed[1];
8909 unsigned int combined = ch;
8911 /* Exclude decompositions where the first part is not a starter,
8912 i.e. is not of canonical combining class 0. */
8913 if (strcmp (unicode_attributes[code1].combining, "0") == 0
8914 /* Exclude characters listed in CompositionExclusions.txt. */
8915 && !unicode_composition_exclusions[combined])
8917 /* The combined character must now also be a starter.
8919 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
8921 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
8925 unicode_attributes[code2].combining);
8930 if (ferror (stream) || fclose (stream))
8932 fprintf (stderr, "error writing to '%s'\n", filename);
/* Write the canonical composition table for Unicode VERSION to FILENAME in
   the form of gperf input: a banner and GPL notice, the declaration of
   struct composition_rule, gperf option lines, the "%%" separator, and one
   keyword line per composition pair.  The pairs are selected with the same
   conditions as in debug_output_composition_tables.  Open and write failures
   are reported on stderr.  */
8938 output_composition_tables (const char *filename, const char *version)
8943 stream = fopen (filename, "w");
8946 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8950 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8951 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
8952 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8954 fprintf (stream, "\n");
8956 /* Put a GPL header on it. The gnulib module is under LGPL (although it
8957 still carries the GPL header), and it's gnulib-tool which replaces the
8958 GPL header with an LGPL header. */
8959 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
8960 fprintf (stream, "\n");
8961 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
8962 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
8963 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
8964 fprintf (stream, " (at your option) any later version.\n");
8965 fprintf (stream, "\n");
8966 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
8967 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
8968 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
8969 fprintf (stream, " GNU General Public License for more details.\n");
8970 fprintf (stream, "\n");
8971 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
8972 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
8973 fprintf (stream, "\n");
8975 /* The composition table is a set of mappings (code1, code2) -> combined,
8977 367 values for code1 (from 0x003C to 0x30FD),
8978 54 values for code2 (from 0x0300 to 0x309A).
8979 For a fixed code1, there are from 1 to 19 possible values for code2.
8980 For a fixed code2, there are from 1 to 117 possible values for code1.
8981 This is a very sparse matrix.
8983 We want an O(1) hash lookup.
8985 We could implement the hash lookup by mapping (code1, code2) to a linear
8986 combination mul1*code1 + mul2*code2, which is then used as an index into
8987 a 3-level table. But this leads to a table of size 37 KB.
8989 We use gperf to implement the hash lookup, giving it the 928 sets of
8990 6 bytes (code1, code2) as input. gperf generates a hash table of size
8991 1527, which is quite good (60% filled). It requires an auxiliary table
8992 lookup in a table of size 0.5 KB. The total tables size is 11 KB.
   NOTE(review): the counts and sizes quoted in this comment cannot be
   checked from the code here; confirm them against the current data.  */
8994 fprintf (stream, "struct composition_rule { char codes[6]; };\n");
8995 fprintf (stream, "%%struct-type\n");
8996 fprintf (stream, "%%language=ANSI-C\n");
8997 fprintf (stream, "%%define slot-name codes\n");
8998 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
8999 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
9000 fprintf (stream, "%%compare-lengths\n");
9001 fprintf (stream, "%%compare-strncmp\n");
9002 fprintf (stream, "%%readonly-tables\n");
9003 fprintf (stream, "%%omit-struct-type\n");
9004 fprintf (stream, "%%%%\n");
/* Each keyword line holds the key as six "\x.." escapes, three bytes for
   code1 followed by three bytes for code2, most significant byte first.  */
9006 for (ch = 0; ch < 0x110000; ch++)
9008 unsigned int length;
9009 unsigned int decomposed[MAX_DECOMP_LENGTH];
9010 int type = get_decomposition (ch, &length, decomposed);
9012 if (type == UC_DECOMP_CANONICAL
9013 /* Consider only binary decompositions.
9014 Exclude singleton decompositions. */
9017 unsigned int code1 = decomposed[0];
9018 unsigned int code2 = decomposed[1];
9019 unsigned int combined = ch;
9021 /* Exclude decompositions where the first part is not a starter,
9022 i.e. is not of canonical combining class 0. */
9023 if (strcmp (unicode_attributes[code1].combining, "0") == 0
9024 /* Exclude characters listed in CompositionExclusions.txt. */
9025 && !unicode_composition_exclusions[combined])
9027 /* The combined character must now also be a starter.
9029 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
9031 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
9032 (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
9033 (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
9039 if (ferror (stream) || fclose (stream))
9041 fprintf (stderr, "error writing to '%s'\n", filename);
9046 /* ========================================================================= */
9048 /* Output the test for a simple character mapping table to the given file. */
/* FILENAME is the file to create.  FUNC is the mapping under test; it is
   evaluated for every code point, and entries of the form
   "{ 0xXXXX, 0xYYYY }" pair a code point with its mapped value.
   FUNCTION_NAME is the name that the generated MAP(c) macro expands to.
   VERSION is the Unicode version named in the banner.  The generated file
   includes "test-mapping-part1.h" before the entries and
   "test-mapping-part2.h" after them.  Open and write failures are reported
   on stderr.  */
9051 output_simple_mapping_test (const char *filename,
9052 const char *function_name,
9053 unsigned int (*func) (unsigned int),
9054 const char *version)
9060 stream = fopen (filename, "w");
9063 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9067 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9068 fprintf (stream, "/* Test the Unicode character mapping functions.\n");
9069 fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
9070 fprintf (stream, "\n");
9071 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
9072 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
9073 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
9074 fprintf (stream, " (at your option) any later version.\n");
9075 fprintf (stream, "\n");
9076 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
9077 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
9078 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
9079 fprintf (stream, " GNU General Public License for more details.\n");
9080 fprintf (stream, "\n");
9081 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
9082 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
9083 fprintf (stream, "\n");
9084 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9086 fprintf (stream, "\n");
9087 fprintf (stream, "#include \"test-mapping-part1.h\"\n");
9088 fprintf (stream, "\n");
9091 for (ch = 0; ch < 0x110000; ch++)
9093 unsigned int value = func (ch);
9098 fprintf (stream, ",\n");
9099 fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
9104 fprintf (stream, "\n");
9106 fprintf (stream, "\n");
9107 fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
9108 fprintf (stream, "#include \"test-mapping-part2.h\"\n");
9110 if (ferror (stream) || fclose (stream))
9112 fprintf (stderr, "error writing to '%s'\n", filename);
9117 /* Construction of sparse 3-level tables. */
/* Parameters for the simple mapping table: identifiers carry the
   mapping_table prefix (mapping_table_init, mapping_table_add and
   mapping_table_finalize are called by output_simple_mapping) and elements
   are int32_t.  xmalloc and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): the code that consumes these macros is not visible in this
   part of the file.  */
9118 #define TABLE mapping_table
9119 #define ELEMENT int32_t
9121 #define xmalloc malloc
9122 #define xrealloc realloc
9125 /* Output a simple character mapping table to the given file. */
/* FILENAME is the file to create, FUNC the mapping to tabulate and VERSION
   the Unicode version named in the banner.  The generated file contains five
   mapping_header_N defines and the initializer of the static table
   u_mapping.  Open and write failures are reported on stderr.  */
9128 output_simple_mapping (const char *filename,
9129 unsigned int (*func) (unsigned int),
9130 const char *version)
9134 struct mapping_table t;
9135 unsigned int level1_offset, level2_offset, level3_offset;
9137 stream = fopen (filename, "w");
9140 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9144 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9145 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
9146 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9151 mapping_table_init (&t);
9153 for (ch = 0; ch < 0x110000; ch++)
/* The table stores the signed difference between the mapped value and the
   code point, not the mapped value itself.  */
9155 int value = (int) func (ch) - (int) ch;
9157 mapping_table_add (&t, ch, value);
9160 mapping_table_finalize (&t);
9162 /* Offsets in t.result, in memory of this process. */
/* Five uint32_t header words come first, then the level 1 entries, then the
   level 2 entries, then level 3.  */
9164 5 * sizeof (uint32_t);
9166 5 * sizeof (uint32_t)
9167 + t.level1_size * sizeof (uint32_t);
9169 5 * sizeof (uint32_t)
9170 + t.level1_size * sizeof (uint32_t)
9171 + (t.level2_size << t.q) * sizeof (uint32_t);
9173 for (i = 0; i < 5; i++)
9174 fprintf (stream, "#define mapping_header_%d %d\n", i,
9175 ((uint32_t *) t.result)[i]);
9176 fprintf (stream, "static const\n");
9177 fprintf (stream, "struct\n");
9178 fprintf (stream, " {\n");
9179 fprintf (stream, " int level1[%zu];\n", t.level1_size);
9180 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
9181 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
9182 fprintf (stream, " }\n");
9183 fprintf (stream, "u_mapping =\n");
9184 fprintf (stream, "{\n");
/* Level 1, eight entries per output line.  An entry is printed either as -1
   or as its offset relative to level2_offset, counted in uint32_t units.  */
9185 fprintf (stream, " {");
9186 if (t.level1_size > 8)
9187 fprintf (stream, "\n ");
9188 for (i = 0; i < t.level1_size; i++)
9191 if (i > 0 && (i % 8) == 0)
9192 fprintf (stream, "\n ");
9193 offset = ((uint32_t *) (t.result + level1_offset))[i];
9195 fprintf (stream, " %5d", -1);
9197 fprintf (stream, " %5zu",
9198 (offset - level2_offset) / sizeof (uint32_t));
9199 if (i+1 < t.level1_size)
9200 fprintf (stream, ",");
9202 if (t.level1_size > 8)
9203 fprintf (stream, "\n ");
9204 fprintf (stream, " },\n");
/* Level 2, eight entries per output line.  An entry is printed either as -1
   or as its offset relative to level3_offset, counted in int32_t units.  */
9205 fprintf (stream, " {");
9206 if (t.level2_size << t.q > 8)
9207 fprintf (stream, "\n ");
9208 for (i = 0; i < t.level2_size << t.q; i++)
9211 if (i > 0 && (i % 8) == 0)
9212 fprintf (stream, "\n ");
9213 offset = ((uint32_t *) (t.result + level2_offset))[i];
9215 fprintf (stream, " %5d", -1);
9217 fprintf (stream, " %5zu",
9218 (offset - level3_offset) / sizeof (int32_t));
9219 if (i+1 < t.level2_size << t.q)
9220 fprintf (stream, ",");
9222 if (t.level2_size << t.q > 8)
9223 fprintf (stream, "\n ");
9224 fprintf (stream, " },\n");
/* Level 3, eight entries per output line: the stored differences, printed as
   signed decimal numbers.  */
9225 fprintf (stream, " {");
9226 if (t.level3_size << t.p > 8)
9227 fprintf (stream, "\n ");
9228 for (i = 0; i < t.level3_size << t.p; i++)
9230 if (i > 0 && (i % 8) == 0)
9231 fprintf (stream, "\n ");
9232 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
9233 if (i+1 < t.level3_size << t.p)
9234 fprintf (stream, ",");
9236 if (t.level3_size << t.p > 8)
9237 fprintf (stream, "\n ");
9238 fprintf (stream, " }\n");
9239 fprintf (stream, "};\n");
9241 if (ferror (stream) || fclose (stream))
9243 fprintf (stderr, "error writing to '%s'\n", filename);
9248 /* ========================================================================= */
9250 /* A special casing context.
9251 A context is negated through x -> -x. */
/* fill_casing_rules starts every rule with SCC_ALWAYS and replaces it when
   the input names a context; a "Not_" prefix in the input is what leads to
   the negated (negative) value.  output_casing_rules prints a negative
   context as '-' followed by the name of the positive one.  */
9256 SCC_AFTER_SOFT_DOTTED,
9262 /* A special casing rule. */
/* Each *_mapping array holds up to 3 code points.  fill_casing_rules
   zero-fills the arrays before parsing, so unused trailing slots are 0.  */
9263 struct special_casing_rule
9266 unsigned int lower_mapping[3];
9267 unsigned int title_mapping[3];
9268 unsigned int upper_mapping[3];
/* Not assigned by fill_casing_rules; redistribute_casefolding_rules fills
   it in afterwards.  */
9269 unsigned int casefold_mapping[3];
/* Two-letter language code (output_casing_rules asserts a length of 2),
   or NULL when the rule is not restricted to a language.  */
9270 const char *language;
/* The special casing rules.
   casing_rules is a growable array of num_casing_rules pointers, with room
   for allocated_casing_rules entries.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Appends NEW_RULE to casing_rules, growing the array when it is full.
   The capacity doubles on each growth and is never less than 16.
   The array stores the pointer itself; NEW_RULE is not copied.
   If the array cannot be grown, an error is reported on stderr and the
   program exits with status 1.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;

      /* Keep the old pointer until realloc has succeeded, so that a failure
         is detected instead of dereferencing NULL below.  */
      new_rules =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      if (new_rules == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
9294 /* Stores in casing_rules the special casing rules found in
9295 specialcasing_filename. */
/* The parser below expects each data line to have the shape
     <code>; <lower>; <title>; <upper>; [<language>] [<context>] [;] [# ...]
   where <code> is one hexadecimal number and <lower>, <title>, <upper> are
   each up to 3 space-separated hexadecimal numbers.  Empty lines and lines
   starting with '#' are skipped.  Any previous contents of casing_rules are
   discarded (the array is reset, not freed).
   NOTE(review): the casefold_mapping field of the stored rules is not
   assigned here; redistribute_casefolding_rules fills it in later.  */
9297 fill_casing_rules (const char *specialcasing_filename)
9301 stream = fopen (specialcasing_filename, "r");
9304 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
/* Start with an empty rule list.  */
9308 casing_rules = NULL;
9309 num_casing_rules = 0;
9310 allocated_casing_rules = 0;
9320 unsigned int lower_mapping[3];
9321 unsigned int title_mapping[3];
9322 unsigned int upper_mapping[3];
/* Read one line of at most 200 characters; the trailing "\n" in the format
   also consumes the line terminator and any following whitespace.  */
9326 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9329 if (buf[0] == '\0' || buf[0] == '#')
/* Scan the code point the rule applies to.  */
9334 code = strtoul (scanptr, &endptr, 16);
9335 if (endptr == scanptr)
9337 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9341 if (*scanptr != ';')
9343 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9348 /* Scan lower mapping. */
9349 for (i = 0; i < 3; i++)
9350 lower_mapping[i] = 0;
9351 for (i = 0; i < 3; i++)
9353 while (*scanptr == ' ')
9355 if (*scanptr == ';')
9357 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
9358 if (endptr == scanptr)
9360 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9365 if (*scanptr != ';')
9367 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9372 /* Scan title mapping. */
9373 for (i = 0; i < 3; i++)
9374 title_mapping[i] = 0;
9375 for (i = 0; i < 3; i++)
9377 while (*scanptr == ' ')
9379 if (*scanptr == ';')
9381 title_mapping[i] = strtoul (scanptr, &endptr, 16);
9382 if (endptr == scanptr)
9384 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9389 if (*scanptr != ';')
9391 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9396 /* Scan upper mapping. */
9397 for (i = 0; i < 3; i++)
9398 upper_mapping[i] = 0;
9399 for (i = 0; i < 3; i++)
9401 while (*scanptr == ' ')
9403 if (*scanptr == ';')
9405 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
9406 if (endptr == scanptr)
9408 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9413 if (*scanptr != ';')
9415 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9420 /* Scan language and context. */
/* Both are optional.  Without a context word the rule keeps SCC_ALWAYS.  */
9422 context = SCC_ALWAYS;
9423 while (*scanptr == ' ')
9425 if (*scanptr != '\0' && *scanptr != '#')
9427 const char *word_begin = scanptr;
9428 const char *word_end;
9430 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9434 while (*scanptr == ' ')
/* A first word of exactly two characters is taken as the language code;
   it is copied into freshly allocated storage.
   NOTE(review): no NULL check of the malloc result is visible here.  */
9437 if (word_end - word_begin == 2)
9439 language = (char *) malloc ((word_end - word_begin) + 1);
9440 memcpy (language, word_begin, 2);
9441 language[word_end - word_begin] = '\0';
9442 word_begin = word_end = NULL;
/* After a language code, a second word (the context) may follow.  */
9444 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9446 word_begin = scanptr;
9447 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
/* Interpret the remaining word, if any, as a context name.  */
9453 if (word_end > word_begin)
9455 bool negate = false;
/* A "Not_" prefix marks a negated context.  */
9457 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
9462 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
9463 context = SCC_FINAL_SIGMA;
9464 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
9465 context = SCC_AFTER_SOFT_DOTTED;
9466 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
9467 context = SCC_MORE_ABOVE;
9468 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
9469 context = SCC_BEFORE_DOT;
9470 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
9471 context = SCC_AFTER_I;
9474 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
/* Negated contexts are represented by the negative enum value.  */
9478 context = - context;
/* Only end of line, a comment or a ';' may follow.  */
9481 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9483 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9488 /* Store the rule. */
/* NOTE(review): no NULL check of the malloc result is visible here.  */
9490 struct special_casing_rule *new_rule =
9491 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9492 new_rule->code = code;
9493 new_rule->language = language;
9494 new_rule->context = context;
9495 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
9496 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
9497 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
9499 add_casing_rule (new_rule);
/* Report read errors as well as a failing fclose.  */
9503 if (ferror (stream) || fclose (stream))
9505 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
9510 /* A casefolding rule. */
9511 struct casefold_rule
/* Up to 3 code points the character folds to.  */
9514 unsigned int mapping[3];
/* Language the rule is restricted to, or NULL for a language-independent
   rule.  fill_casefolding_rules points this at string literals ("tr", "az")
   or sets it to NULL; it is not heap-allocated there.  */
9515 const char *language;
9518 /* The casefolding rules. */
/* Growable array: num_casefolding_rules entries are in use, out of
   allocated_casefolding_rules allocated ones.  */
9519 struct casefold_rule **casefolding_rules;
9520 unsigned int num_casefolding_rules;
9521 unsigned int allocated_casefolding_rules;
9523 /* Stores in casefolding_rules the case folding rules found in
9524 casefolding_filename. */
/* The parser below expects each data line to have the shape
     <code>; <status>; <mapping>; ...
   where <code> is a hexadecimal number, <status> is one of C, F, S, T and
   <mapping> is up to 3 space-separated hexadecimal numbers.  Empty lines
   and lines starting with '#' are skipped.  Any previous contents of
   casefolding_rules are discarded (the array is reset, not freed).  */
9526 fill_casefolding_rules (const char *casefolding_filename)
9530 stream = fopen (casefolding_filename, "r");
9533 fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
/* Start with an empty rule list.  */
9537 casefolding_rules = NULL;
9538 num_casefolding_rules = 0;
9539 allocated_casefolding_rules = 0;
9550 unsigned int mapping[3];
/* Read one line of at most 200 characters.  */
9552 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9555 if (buf[0] == '\0' || buf[0] == '#')
/* Scan the code point the rule applies to.  */
9560 code = strtoul (scanptr, &endptr, 16);
9561 if (endptr == scanptr)
9563 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9567 if (*scanptr != ';')
9569 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
/* Scan the status letter; only C, F, S and T are accepted.  */
9575 while (*scanptr == ' ')
9580 case 'C': case 'F': case 'S': case 'T':
9584 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9588 if (*scanptr != ';')
9590 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9595 /* Scan casefold mapping. */
9596 for (i = 0; i < 3; i++)
9598 for (i = 0; i < 3; i++)
9600 while (*scanptr == ' ')
9602 if (*scanptr == ';')
9604 mapping[i] = strtoul (scanptr, &endptr, 16);
9605 if (endptr == scanptr)
9607 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9612 if (*scanptr != ';')
9614 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9619 /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
9622 const char * const *languages;
9623 unsigned int languages_count;
9625 /* Type 'T' indicates that the rule is applicable to Turkish
/* A 'T' rule is stored twice, once for each of these two languages.  */
9629 static const char * const turkish_languages[] = { "tr", "az" };
9630 languages = turkish_languages;
9631 languages_count = 2;
/* Otherwise a single rule with a NULL (unrestricted) language is stored.  */
9635 static const char * const all_languages[] = { NULL };
9636 languages = all_languages;
9637 languages_count = 1;
9640 for (i = 0; i < languages_count; i++)
9642 /* Store a new rule. */
/* NOTE(review): no NULL check of the malloc result is visible here.  */
9643 struct casefold_rule *new_rule =
9644 (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
9645 new_rule->code = code;
9646 memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
9647 new_rule->language = languages[i];
/* Grow the array when full: double the capacity, with a minimum of 16.
   NOTE(review): no NULL check of the realloc result is visible here.  */
9649 if (num_casefolding_rules == allocated_casefolding_rules)
9651 allocated_casefolding_rules = 2 * allocated_casefolding_rules;
9652 if (allocated_casefolding_rules < 16)
9653 allocated_casefolding_rules = 16;
9655 (struct casefold_rule **)
9656 realloc (casefolding_rules,
9657 allocated_casefolding_rules * sizeof (struct casefold_rule *));
9659 casefolding_rules[num_casefolding_rules++] = new_rule;
/* Report read errors as well as a failing fclose.  */
9664 if (ferror (stream) || fclose (stream))
9666 fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
/* Casefold mapping, when it maps to a single character.
   Indexed by code point; filled by redistribute_casefolding_rules.  */
unsigned int unicode_casefold[0x110000];

/* Returns the single-character casefold mapping of CH, as stored in
   unicode_casefold.
   CH must be a valid index, i.e. less than 0x110000.  This is asserted
   because some callers pass code points that were read from input files
   without range validation.  */
static unsigned int
to_casefold (unsigned int ch)
{
  assert (ch < 0x110000);
  return unicode_casefold[ch];
}
9680 /* Redistribute the casefolding_rules:
9681 - Rules that map to a single character, language independently, are stored
9682 in unicode_casefold.
9683 - Other rules are merged into casing_rules. */
/* Reads casefolding_rules and casing_rules; writes unicode_casefold, the
   casefold_mapping field of every rule in casing_rules, and may append new
   rules to casing_rules through add_casing_rule.  */
9685 redistribute_casefolding_rules (void)
9687 unsigned int ch, i, j;
9689 /* Fill unicode_casefold[]. */
/* Default: every code point folds to itself.  */
9690 for (ch = 0; ch < 0x110000; ch++)
9691 unicode_casefold[ch] = ch;
9692 for (i = 0; i < num_casefolding_rules; i++)
9694 struct casefold_rule *cfrule = casefolding_rules[i];
/* mapping[1] == 0 means the rule maps to a single character.  */
9696 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
9699 assert (ch < 0x110000);
9700 unicode_casefold[ch] = cfrule->mapping[0];
9704 /* Extend the special casing rules by filling in their casefold_mapping[]
/* Initial value: the single-character folding of the rule's code, followed
   by zeros.  The merge loop below may overwrite it.  */
9706 for (j = 0; j < num_casing_rules; j++)
9708 struct special_casing_rule *rule = casing_rules[j];
9711 rule->casefold_mapping[0] = to_casefold (rule->code);
9712 for (k = 1; k < 3; k++)
9713 rule->casefold_mapping[k] = 0;
9716 /* Now merge the other casefolding rules into casing_rules. */
9717 for (i = 0; i < num_casefolding_rules; i++)
9719 struct casefold_rule *cfrule = casefolding_rules[i];
/* Exactly the rules that were not stored in unicode_casefold above.  */
9721 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
9723 /* Find a rule that applies to the same code, same language, and it
9724 has context SCC_ALWAYS. At the same time, update all rules that
9725 have the same code and same or more specific language. */
9726 struct special_casing_rule *found_rule = NULL;
9728 for (j = 0; j < num_casing_rules; j++)
9730 struct special_casing_rule *rule = casing_rules[j];
/* The casefold rule applies to this casing rule if the codes are equal and
   the casefold rule is either language-independent or has the same
   language.  */
9732 if (rule->code == cfrule->code
9733 && (cfrule->language == NULL
9734 || (rule->language != NULL
9735 && strcmp (rule->language, cfrule->language) == 0)))
9737 memcpy (rule->casefold_mapping, cfrule->mapping,
9738 sizeof (rule->casefold_mapping));
/* Exact match: same language restriction (or none on both sides) and
   context SCC_ALWAYS.  Presumably this is where found_rule gets set; the
   assignment is not visible in this view.  */
9740 if ((cfrule->language == NULL
9741 ? rule->language == NULL
9742 : rule->language != NULL
9743 && strcmp (rule->language, cfrule->language) == 0)
9744 && rule->context == SCC_ALWAYS)
9752 if (found_rule == NULL)
9754 /* Create a new rule. */
/* NOTE(review): no NULL check of the malloc result is visible here.  */
9755 struct special_casing_rule *new_rule =
9756 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9758 /* Try to find a rule that applies to the same code, no language
9759 restriction, and with context SCC_ALWAYS. */
9760 for (j = 0; j < num_casing_rules; j++)
9762 struct special_casing_rule *rule = casing_rules[j];
9764 if (rule->code == cfrule->code
9765 && rule->context == SCC_ALWAYS
9766 && rule->language == NULL)
9774 new_rule->code = cfrule->code;
9775 new_rule->language = cfrule->language;
9776 new_rule->context = SCC_ALWAYS;
/* With such a rule, inherit its lower/title/upper mappings.  */
9777 if (found_rule != NULL)
9779 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
9780 sizeof (new_rule->lower_mapping));
9781 memcpy (new_rule->title_mapping, found_rule->title_mapping,
9782 sizeof (new_rule->title_mapping));
9783 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
9784 sizeof (new_rule->upper_mapping));
/* Without one, use the results of to_lower, to_title and to_upper for the
   code, each followed by zeros.  */
9790 new_rule->lower_mapping[0] = to_lower (cfrule->code);
9791 for (k = 1; k < 3; k++)
9792 new_rule->lower_mapping[k] = 0;
9793 new_rule->title_mapping[0] = to_title (cfrule->code);
9794 for (k = 1; k < 3; k++)
9795 new_rule->title_mapping[k] = 0;
9796 new_rule->upper_mapping[0] = to_upper (cfrule->code);
9797 for (k = 1; k < 3; k++)
9798 new_rule->upper_mapping[k] = 0;
9800 memcpy (new_rule->casefold_mapping, cfrule->mapping,
9801 sizeof (new_rule->casefold_mapping));
9803 add_casing_rule (new_rule);
9810 compare_casing_rules (const void *a, const void *b)
9812 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
9813 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
9814 unsigned int a_code = a_rule->code;
9815 unsigned int b_code = b_rule->code;
9817 if (a_code < b_code)
9819 if (a_code > b_code)
9822 /* Sort the more specific rules before the more general ones. */
9823 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
9824 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
9828 sort_casing_rules (void)
9830 /* Sort the rules 1. by code, 2. by specificity. */
9831 if (num_casing_rules > 1)
9832 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
9833 compare_casing_rules);
9836 /* Output the special casing rules. */
/* Writes casing_rules to FILENAME as a gperf input file: a declarations
   section, the "%%" separator, then one keyword line per rule.  VERSION is
   the Unicode version, presumably the argument of the "%s" in the generated
   header comment (its use is not visible in this view).
   The minor-byte and "more rules follow" logic compares each rule with its
   array neighbours, so it is only meaningful if rules with the same code
   are adjacent, as they are after sort_casing_rules.  */
9838 output_casing_rules (const char *filename, const char *version)
9844 stream = fopen (filename, "w");
9847 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9851 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9852 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
9853 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9855 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
9856 fprintf (stream, "%%struct-type\n");
9857 fprintf (stream, "%%language=ANSI-C\n");
9858 fprintf (stream, "%%define slot-name code\n");
9859 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
9860 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
9861 fprintf (stream, "%%compare-lengths\n");
9862 fprintf (stream, "%%compare-strncmp\n");
9863 fprintf (stream, "%%readonly-tables\n");
9864 fprintf (stream, "%%omit-struct-type\n");
9865 fprintf (stream, "%%%%\n");
9868 for (i = 0; i < num_casing_rules; i++)
9870 struct special_casing_rule *rule = casing_rules[i];
/* Same code as the previous rule: the minor byte written below is what
   tells the keys apart (its computation is not visible in this view).  */
9873 if (i > 0 && rule->code == casing_rules[i - 1]->code)
/* The key has room for only two bytes of the code.  */
9878 if (!(rule->code < 0x10000))
9880 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
/* Key: high byte of the code, low byte of the code, minor.  */
9884 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
9885 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
/* Flag: 1 if the next rule has the same code, else 0.  */
9887 fprintf (stream, "%d, ",
9888 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
/* Context: a '-' sign for a negated context, then the SCC_* name.  */
9890 context = rule->context;
9893 fprintf (stream, "-");
9894 context = - context;
9897 fprintf (stream, " ");
9901 fprintf (stream, "SCC_ALWAYS ");
9903 case SCC_FINAL_SIGMA:
9904 fprintf (stream, "SCC_FINAL_SIGMA ");
9906 case SCC_AFTER_SOFT_DOTTED:
9907 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
9909 case SCC_MORE_ABOVE:
9910 fprintf (stream, "SCC_MORE_ABOVE ");
9912 case SCC_BEFORE_DOT:
9913 fprintf (stream, "SCC_BEFORE_DOT ");
9916 fprintf (stream, "SCC_AFTER_I ");
9921 fprintf (stream, ", ");
/* Language: two characters, or two NUL characters if unrestricted.  */
9923 if (rule->language != NULL)
9925 assert (strlen (rule->language) == 2);
9926 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
9929 fprintf (stream, "{ '\\0', '\\0' }, ");
/* The mappings are written in the order upper, lower, title, casefold,
   which differs from the field order in struct special_casing_rule.
   Each value must be less than 0x10000.  */
9931 fprintf (stream, "{ ");
9932 for (j = 0; j < 3; j++)
9935 fprintf (stream, ", ");
9936 if (!(rule->upper_mapping[j] < 0x10000))
9938 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
9941 if (rule->upper_mapping[j] != 0)
9942 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
9944 fprintf (stream, " 0");
9946 fprintf (stream, " }, { ");
9947 for (j = 0; j < 3; j++)
9950 fprintf (stream, ", ");
9951 if (!(rule->lower_mapping[j] < 0x10000))
9953 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
9956 if (rule->lower_mapping[j] != 0)
9957 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
9959 fprintf (stream, " 0");
9961 fprintf (stream, " }, { ");
9962 for (j = 0; j < 3; j++)
9965 fprintf (stream, ", ");
9966 if (!(rule->title_mapping[j] < 0x10000))
9968 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
9971 if (rule->title_mapping[j] != 0)
9972 fprintf (stream, "0x%04X", rule->title_mapping[j]);
9974 fprintf (stream, " 0");
9976 fprintf (stream, " }, { ");
9977 for (j = 0; j < 3; j++)
9980 fprintf (stream, ", ");
9981 if (!(rule->casefold_mapping[j] < 0x10000))
9983 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
9986 if (rule->casefold_mapping[j] != 0)
9987 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
9989 fprintf (stream, " 0");
9991 fprintf (stream, " }\n");
/* Report write errors as well as a failing fclose.  */
9994 if (ferror (stream) || fclose (stream))
9996 fprintf (stderr, "error writing to '%s'\n", filename);
10001 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.
   Returns true if CH is "cased" in this sense.  The three tests are tried in
   the order lowercase, uppercase, titlecase letter; the first one that
   succeeds decides.  */
static bool
is_cased (unsigned int ch)
{
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  if (is_category_Lt (ch))
    return true;
  return false;
}
10015 /* Quoting the Unicode standard:
10016 Definition: A character is defined to be "case-ignorable" if it has the
10017 value MidLetter {or the value MidNumLet} for the Word_Break property or
10018 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
10019 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10020 The text marked in braces was added in Unicode 5.1.0, see
10021 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
10022 Definition of case-ignorable". */
10023 /* Since this predicate is only used for the "Before C" and "After C"
10024 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
10025 This simplifies the evaluation of the regular expressions
10026 \p{cased} (\p{case-ignorable})* C
10028 C (\p{case-ignorable})* \p{cased}
10031 is_case_ignorable (unsigned int ch)
10033 return (unicode_org_wbp[ch] == WBP_MIDLETTER
10034 || unicode_org_wbp[ch] == WBP_MIDNUMLET
10035 || is_category_Mn (ch)
10036 || is_category_Me (ch)
10037 || is_category_Cf (ch)
10038 || is_category_Lm (ch)
10039 || is_category_Sk (ch))
10043 /* ------------------------------------------------------------------------- */
10045 /* Output all case related properties. */
10047 output_casing_properties (const char *version)
10049 #define PROPERTY(FN,P) \
10050 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
10051 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
10052 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
10053 PROPERTY(cased, cased)
10054 PROPERTY(ignorable, case_ignorable)
10058 /* ========================================================================= */
/* Program entry point.
   Takes 15 arguments: the 14 Unicode data files in the order shown in the
   usage message, then the Unicode version string.  It first loads all input
   files into the global tables, then writes every generated table, test and
   debug file to fixed relative paths.  Those paths are relative to the
   current directory.  */
10061 main (int argc, char * argv[])
10063 const char *unicodedata_filename;
10064 const char *proplist_filename;
10065 const char *derivedproplist_filename;
10066 const char *arabicshaping_filename;
10067 const char *scripts_filename;
10068 const char *blocks_filename;
10069 const char *proplist30_filename;
10070 const char *eastasianwidth_filename;
10071 const char *linebreak_filename;
10072 const char *wordbreakproperty_filename;
10073 const char *graphemebreakproperty_filename;
10074 const char *compositionexclusions_filename;
10075 const char *specialcasing_filename;
10076 const char *casefolding_filename;
10077 const char *version;
10081 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
/* Positional arguments, in the order given in the usage message.  */
10086 unicodedata_filename = argv[1];
10087 proplist_filename = argv[2];
10088 derivedproplist_filename = argv[3];
10089 arabicshaping_filename = argv[4];
10090 scripts_filename = argv[5];
10091 blocks_filename = argv[6];
10092 proplist30_filename = argv[7];
10093 eastasianwidth_filename = argv[8];
10094 linebreak_filename = argv[9];
10095 wordbreakproperty_filename = argv[10];
10096 graphemebreakproperty_filename = argv[11];
10097 compositionexclusions_filename = argv[12];
10098 specialcasing_filename = argv[13];
10099 casefolding_filename = argv[14];
10100 version = argv[15];
/* Input phase: read all data files into the global tables.  */
10102 fill_attributes (unicodedata_filename);
10103 clear_properties ();
10104 fill_properties (proplist_filename);
10105 fill_properties (derivedproplist_filename);
10106 fill_properties30 (proplist30_filename);
10107 fill_arabicshaping (arabicshaping_filename);
10108 fill_scripts (scripts_filename);
10109 fill_blocks (blocks_filename);
10110 fill_width (eastasianwidth_filename);
10111 fill_org_lbp (linebreak_filename);
10112 fill_org_wbp (wordbreakproperty_filename);
10113 fill_org_gbp (graphemebreakproperty_filename);
10114 fill_composition_exclusions (compositionexclusions_filename);
/* The casing rules must be read before the casefolding rules are merged
   into them, and sorted only after the merge has appended its rules.  */
10115 fill_casing_rules (specialcasing_filename);
10116 fill_casefolding_rules (casefolding_filename);
10117 redistribute_casefolding_rules ();
10118 sort_casing_rules ();
/* Output phase: character classification (unictype).  */
10120 output_categories (version);
10121 output_category ("unictype/categ_of.h", version);
10122 output_combclass ("unictype/combiningclass.h", version);
10123 output_bidi_category ("unictype/bidi_of.h", version);
10124 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
10125 output_decimal_digit ("unictype/decdigit.h", version);
10126 output_digit_test ("../tests/unictype/test-digit.h", version);
10127 output_digit ("unictype/digit.h", version);
10128 output_numeric_test ("../tests/unictype/test-numeric.h", version);
10129 output_numeric ("unictype/numeric.h", version);
10130 output_mirror ("unictype/mirror.h", version);
10131 output_properties (version);
10132 output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
10133 output_joining_type ("unictype/joiningtype_of.h", version);
10134 output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
10135 output_joining_group ("unictype/joininggroup_of.h", version);
10137 output_scripts (version);
10138 output_scripts_byname (version);
10139 output_blocks (version);
10140 output_ident_properties (version);
10141 output_nonspacing_property ("uniwidth/width.c.part");
10142 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
10143 output_old_ctype (version);
/* Line breaking (unilbrk).  */
10145 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
10146 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
10147 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
/* Word breaking (uniwbrk).  */
10149 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
10150 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
10151 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
/* Grapheme cluster breaking (unigbrk).  */
10153 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
10154 output_gbp_table ("unigbrk/gbrkprop.h", version);
/* Normalization (uninorm).  */
10156 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
10157 debug_output_composition_tables ("uninorm/composition.txt");
10158 output_composition_tables ("uninorm/composition-table.gperf", version);
/* Case mappings (unicase).  */
10160 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
10161 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
10162 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
10163 output_simple_mapping ("unicase/toupper.h", to_upper, version);
10164 output_simple_mapping ("unicase/tolower.h", to_lower, version);
10165 output_simple_mapping ("unicase/totitle.h", to_title, version);
10166 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
10167 output_casing_rules ("unicase/special-casing-table.gperf", version);
10168 output_casing_properties (version);
10176 * compile-command: "\
10177 * gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \\
10178 * ./gen-uni-tables \\
10179 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \\
10180 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \\
10181 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \\
10182 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/ArabicShaping.txt \\
10183 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \\
10184 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \\
10185 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \\
10186 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \\
10187 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \\
10188 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \\
10189 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \\
10190 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \\
10191 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \\
10192 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \\
10194 * && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \\
10195 * && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt"