3 # This needs http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
# Path of the Unicode data file to read, taken from the first command-line
# argument.
5 inputfile="$1" # Expect UnicodeData.txt
# Name of the C header this script writes.
6 outfile=archive_string_composition.h
# Temporary files that receive the two generated awk scripts. "$$" expands to
# the shell's process ID, which keeps the names distinct per invocation.
7 pickout=/tmp/mk_unicode_composition_tbl$$.awk
8 pickout2=/tmp/mk_unicode_composition_tbl2$$.awk
9 #nfdtmp=/tmp/mk_unicode_decomposition_tmp$$.txt
11 #################################################################################
13 # Write the file header of "archive_string_composition.h"
15 #################################################################################
18 cat > ${outfile} <<CR_END
20 * Copyright (c) 2011-2012 libarchive Project
21 * All rights reserved.
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
26 * 1. Redistributions of source code must retain the above copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
32 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
33 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
34 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
35 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
36 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
38 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
39 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
41 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
49 * This file is generated by build/utils/gen_archive_string_composition_h.sh
50 * from http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
52 * See also http://unicode.org/reports/tr15/
55 #ifndef __LIBARCHIVE_BUILD
56 #error This header is only to be used internally to libarchive.
59 #ifndef ARCHIVE_STRING_COMPOSITION_H_INCLUDED
60 #define ARCHIVE_STRING_COMPOSITION_H_INCLUDED
62 struct unicode_composition_table {
70 #################################################################################
74 #################################################################################
# Write the first awk script to ${pickout}. The here-document delimiter is
# unquoted, so the shell expands variables in the body; that is why awk's own
# field references and backslashes appear escaped (\$1, \\n) in the script text.
75 cat > ${pickout} <<AWK_END
81 cmd="sort | awk -F ' ' '{printf \"\\\\t{ 0x%s , 0x%s , 0x%s },\\\\n\",\$1,\$2,\$3}'"
83 print "static const struct unicode_composition_table u_composition_table[] = {"
90 # Output Canonical Combining Class tables used for translating NFD to NFC.
92 printf "#define CANONICAL_CLASS_MIN\\t0x%s\\n", min
93 printf "#define CANONICAL_CLASS_MAX\\t0x%s\\n", max
95 printf "#define IS_DECOMPOSABLE_BLOCK(uc)\\t\\\\\n"
96 printf "\\t(((uc)>>8) <= 0x%X && u_decomposable_blocks[(uc)>>8])\\n", highnum
97 printf "static const char u_decomposable_blocks[0x%X+1] = {\\n\\t", highnum
100 for (i = 0; i <= highnum; i++) {
101 if (i != 0 && i % 32 == 0)
103 # Additionally Hangul[11XX(17), AC00(172) - D7FF(215)] is decomposable.
104 if (blockmap[i] || i == 17 || (i >= 172 && i <= 215))
111 # Output a macro to get a canonical combining class.
113 print "/* Get Canonical Combining Class(CCC). */"
114 printf "#define CCC(uc)\\t\\\\\n"
115 printf "\\t(((uc) > 0x%s)?0:\\\\\\n", max
116 printf "\\tccc_val[ccc_val_index[ccc_index[(uc)>>8]][((uc)>>4)&0x0F]][(uc)&0x0F])\\n"
119 # Output a canonical combining class value table.
122 printf "/* The table of the value of Canonical Combining Class */\\n"
123 print "static const unsigned char ccc_val[][16] = {"
124 print " /* idx=0: XXXX0 - XXXXF */"
125 print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
126 for (h = 0; h <= highnum; h++) {
129 for (m = 0; m < 16; m++) {
130 if (!xx_blockmap[h, m])
133 printf " /* idx=%d: %03X%1X0 - %03X%1XF */\\n {", midcnt, h, m, h, m
134 for (l = 0; l < 15; l++) {
135 printf "%d, ", xxx_blockmap[h, m, l]
137 printf "%d },\n", xxx_blockmap[h, m, 15]
142 # Output the index table of the canonical combining class value table.
146 printf "\\n/* The index table to ccc_val[*][16] */\\n"
147 print "static const unsigned char ccc_val_index[][16] = {"
148 print " /* idx=0: XXX00 - XXXFF */"
149 print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
150 for (h = 0; h <= highnum; h++) {
154 printf " /* idx=%d: %03X00 - %03XFF */\\n {", cnt, h, h
155 for (m = 0; m < 16; m++) {
158 if (xx_blockmap[h, m]) {
168 # Output the index table to the index table of the canonical combining
171 printf "\\n/* The index table to ccc_val_index[*][16] */\\n"
172 printf "static const unsigned char ccc_index[] = {\\n ", h
174 for (h = 0; h <= highnum; h++) {
175 if (h != 0 && h % 24 == 0)
191 for (i=0; i < length(hex); i++) {
192 x = substr(hex, i+1, 1)
211 # Collect Canonical Combining Class values.
219 high = substr(\$1, 1, length(\$1) -2)
220 highnum = hextoi(high)
221 mid = substr(\$1, length(\$1) -1, 1)
223 low = substr(\$1, length(\$1), 1)
225 blockmap[highnum] = 1
226 xx_blockmap[highnum, midnum] = 1
227 xxx_blockmap[highnum, midnum, lownum] = \$4
231 # Following code points are not decomposed in MAC OS.
236 #\$1 ~/^2[0-9A-F][0-9A-F][0-9A-F]\$/ {
239 #\$1 ~/^F[9A][0-9A-F][0-9A-F]\$/ {
242 #\$1 ~/^2F[89A][0-9A-F][0-9A-F]\$/ {
246 # Exclusion code points specified by
247 # http://unicode.org/Public/6.0.0/ucd/CompositionExclusions.txt
249 # 1. Script Specifics
251 \$1 ~/^095[89ABCDEF]\$/ {
260 \$1 ~/^0A5[9ABE]\$/ {
290 \$1 ~/^FB2[ABCDEF]\$/ {
293 \$1 ~/^FB3[012345689ABCE]\$/ {
296 \$1 ~/^FB4[01346789ABCDE]\$/ {
300 # 2. Post Composition Version precomposed characters
308 \$1 ~/^1D16[01234]\$/ {
311 \$1 ~/^1D1B[BCDEF]\$/ {
318 # 3. Singleton Decompositions
329 \$1 ~/^1F7[13579BD]\$/ {
341 \$1 ~/^1FE[3BEF]\$/ {
356 \$1 ~/^F9[0-9A-F][0-9A-F]\$/ {
359 \$1 ~/^FA0[0-9A-D]\$/ {
362 \$1 ~/^FA1[025-9A-E]\$/ {
365 \$1 ~/^FA2[0256A-D]\$/ {
368 \$1 ~/^FA[3-5][0-9A-F]\$/ {
371 \$1 ~/^FA6[0-9A-D]\$/ {
374 \$1 ~/^FA[7-9A-C][0-9A-F]\$/ {
380 \$1 ~/^2F[89][0-9A-F][0-9A-F]\$/ {
383 \$1 ~/^2FA0[0-9A-F]\$/ {
386 \$1 ~/^2FA1[0-9A-D]\$/ {
390 # 4. Non-Starter Decompositions
402 # Output combinations for NFD ==> NFC.
404 \$6 ~/^[0-9A-F]+ [0-9A-F]+\$/ {
406 if (length(\$1) == 4)
407 print "0"cp[1], "0"cp[2], "0"\$1 | cmd
409 print cp[1], cp[2], \$1 | cmd
411 if (length(\$1) == 4)
412 print "0"\$1, "0"cp[1], "0"cp[2] >>nfdtbl
414 print \$1, cp[1], cp[2] >>nfdtbl
417 #################################################################################
420 #################################################################################
# Write the second awk script to ${pickout2}; it prints the
# unicode_decomposition_table struct declaration and the table entries.
421 cat > ${pickout2} <<AWK_END
425 print "struct unicode_decomposition_table {"
426 print "\tuint32_t nfc;"
427 print "\tuint32_t cp1;"
428 print "\tuint32_t cp2;"
431 print "static const struct unicode_decomposition_table u_decomposition_table[] = {"
438 printf "\t{ 0x%s , 0x%s , 0x%s },\n", \$1, \$2, \$3;
441 #################################################################################
445 #################################################################################
# Run the first generated awk script over the Unicode data file and append its
# output to the header.
447 awk -f ${pickout} ${inputfile} >> ${outfile}
# Run the second awk script over the file named by ${nfdtmp} and append its
# output to the header.
# NOTE(review): the only nfdtmp assignment visible in this chunk is commented
# out; confirm it is set elsewhere, otherwise awk gets no file operand and
# reads standard input.
448 awk -f ${pickout2} ${nfdtmp} >> ${outfile}
# Close the ARCHIVE_STRING_COMPOSITION_H_INCLUDED include guard and finish the
# header with an empty line.
449 echo "#endif /* ARCHIVE_STRING_COMPOSITION_H_INCLUDED */" >> ${outfile}
450 echo "" >> ${outfile}
452 # Remove the awk scripts.