2 zip_utf-8.c -- UTF-8 support functions for libzip
3 Copyright (C) 2011-2014 Dieter Baron and Thomas Klausner
5 This file is part of libzip, a library to manipulate ZIP archives.
6 The authors can be contacted at <libzip@nih.at>
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions
10 are met:
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13 2. Redistributions in binary form must reproduce the above copyright
14 notice, this list of conditions and the following disclaimer in
15 the documentation and/or other materials provided with the
16 distribution.
17 3. The names of the authors may not be used to endorse or promote
18 products derived from this software without specific prior
19 written permission.
21 THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
22 OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
25 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27 GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
29 IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30 OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
31 IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * CP437 -> Unicode lookup table.
 *
 * 256 entries of zip_uint16_t, indexed by a raw byte value:
 * _zip_cp437_to_utf8 looks up _cp437_to_unicode[cp437buf[i]] and hands the
 * entry to the UTF-8 helpers as a code point.  Each row below holds eight
 * consecutive byte values; every range comment gives the index range of the
 * two rows that follow it.
 *
 * NOTE(review): this listing carries embedded line numbers that skip values,
 * so some original lines (including the closing brace of this initializer)
 * are not visible here.
 */
40 static const zip_uint16_t _cp437_to_unicode[256] = {
/* 0x00 - 0x0F: symbol code points rather than control codes */
/* NOTE(review): index 0x00 maps to 0x2007 rather than 0x0000 -- confirm this is intentional */
42 0x2007, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
43 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
/* 0x10 - 0x1F: symbol code points rather than control codes */
46 0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8,
47 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
/* 0x20 - 0x2F: from here through 0x7E every entry equals its own index */
50 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
51 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
/* 0x30 - 0x3F */
54 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
55 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
/* 0x40 - 0x4F */
58 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
59 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
/* 0x50 - 0x5F */
62 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
63 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
/* 0x60 - 0x6F */
66 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
67 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
/* 0x70 - 0x7F: the last entry (0x7F) maps to 0x2302, not to 0x007F */
70 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
71 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,
/* 0x80 - 0x8F */
74 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
75 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
/* 0x90 - 0x9F */
78 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
79 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
/* 0xA0 - 0xAF */
82 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
83 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
/* 0xB0 - 0xBF */
86 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
87 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
/* 0xC0 - 0xCF */
90 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
91 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
/* 0xD0 - 0xDF */
94 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
95 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
/* 0xE0 - 0xEF */
98 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
99 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,
/* 0xF0 - 0xFF */
102 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
103 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0
/*
 * Bit patterns for classifying and building UTF-8 bytes.
 *
 * A byte b belongs to a class when (b & <CLASS>_MASK) == <CLASS>_MATCH; this
 * is the test _zip_guess_encoding applies.  _zip_unicode_to_utf8 also ORs the
 * *_MATCH values with payload bits to produce encoded bytes.
 */
/* lead byte of a 2-byte sequence: 110xxxxx */
106 #define UTF_8_LEN_2_MASK 0xe0
107 #define UTF_8_LEN_2_MATCH 0xc0
/* lead byte of a 3-byte sequence: 1110xxxx */
108 #define UTF_8_LEN_3_MASK 0xf0
109 #define UTF_8_LEN_3_MATCH 0xe0
/* lead byte of a 4-byte sequence: 11110xxx */
110 #define UTF_8_LEN_4_MASK 0xf8
111 #define UTF_8_LEN_4_MATCH 0xf0
/* continuation byte: 10xxxxxx */
112 #define UTF_8_CONTINUE_MASK 0xc0
113 #define UTF_8_CONTINUE_MATCH 0x80
/*
 * _zip_guess_encoding -- classify the bytes of STR as ASCII, UTF-8 or CP437
 * and compare the result with EXPECTED_ENCODING.
 *
 * str:               string to inspect; its encoding and length members are
 *                    read, and its encoding member is written when a guessed
 *                    UTF-8 result is promoted to ZIP_ENCODING_UTF8_KNOWN.
 * expected_encoding: encoding the caller expects, or ZIP_ENCODING_UNKNOWN to
 *                    skip the comparison.
 *
 * Visible return values: ZIP_ENCODING_ASCII from an early exit, and
 * ZIP_ENCODING_ERROR when the detected encoding is neither ASCII nor the
 * expected one.  The final return is not visible; presumably it returns enc.
 *
 * The visible checks are structural only (lead-byte pattern followed by the
 * right number of continuation bytes); no overlong-form or code point range
 * validation appears in this listing.
 *
 * NOTE(review): this listing carries embedded line numbers that skip values,
 * so several original lines (return type, braces, the bodies of some
 * conditionals, the assignments to name and ulen, loop exits) are not
 * visible here.  Comments below describe only what the visible lines show.
 */
117 _zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding)
119 zip_encoding_type_t enc;
120 const zip_uint8_t *name;
121 zip_uint32_t i, j, ulen;
/* early exit; the guarding condition is not visible (presumably a NULL check on str -- confirm) */
124 return ZIP_ENCODING_ASCII;
/* an encoding already recorded on the string is tested here; the branch body
 * is not visible (presumably it reuses the stored value -- confirm) */
128 if (str->encoding != ZIP_ENCODING_UNKNOWN)
/* otherwise start from ASCII and scan all str->length bytes */
131 enc = ZIP_ENCODING_ASCII;
132 for (i=0; i<str->length; i++) {
/* bytes 32..127 plus CR, LF and TAB are singled out; the branch body is not
 * visible (presumably it moves on to the next byte -- confirm) */
133 if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t')
/* any other byte: assume UTF-8 for now and require a valid lead-byte pattern;
 * a byte matching none of the three patterns makes the result CP437 */
136 enc = ZIP_ENCODING_UTF8_GUESSED;
137 if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH)
139 else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH)
141 else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH)
144 enc = ZIP_ENCODING_CP437;
/* ulen is the number of bytes expected after the lead byte (see the loop
 * below); a sequence that would run past the end of the string is CP437 */
148 if (i + ulen >= str->length) {
149 enc = ZIP_ENCODING_CP437;
/* each of the ulen following bytes must match the continuation pattern */
153 for (j=1; j<=ulen; j++) {
154 if ((name[i+j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
155 enc = ZIP_ENCODING_CP437;
/* compare against the caller's expectation, if one was given */
166 if (expected_encoding != ZIP_ENCODING_UNKNOWN) {
/* the caller knows it is UTF-8 and the bytes look like UTF-8: promote the
 * guess to "known" and record it on the string */
167 if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == ZIP_ENCODING_UTF8_GUESSED)
168 str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN;
/* pure ASCII is accepted under any expectation; any other mismatch is an error */
170 if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII)
171 return ZIP_ENCODING_ERROR;
/*
 * _zip_unicode_to_utf8_len -- pick the UTF-8 size class of CODEPOINT.
 *
 * The code point is compared against 0x80, 0x800 and 0x10000 in turn.  These
 * are the same thresholds _zip_unicode_to_utf8 uses to choose between
 * writing 1, 2, 3 or 4 bytes.
 *
 * NOTE(review): the return type and every return statement are missing from
 * this listing (its embedded line numbers skip values).  Presumably the
 * function returns 1, 2 and 3 for the three tests below and 4 otherwise --
 * confirm against the complete file.
 */
179 _zip_unicode_to_utf8_len(zip_uint32_t codepoint)
/* below 0x80: 1-byte class */
181 if (codepoint < 0x0080)
/* below 0x800: 2-byte class */
183 if (codepoint < 0x0800)
/* below 0x10000: 3-byte class; the fall-through case is not visible */
185 if (codepoint < 0x10000)
/*
 * _zip_unicode_to_utf8 -- write CODEPOINT to BUF as UTF-8.
 *
 * codepoint: value to encode.  No range or surrogate validation is visible;
 *            bits above the low 21 are discarded by the masks in the 4-byte
 *            form.
 * buf:       destination; up to four bytes (buf[0]..buf[3]) are written, so
 *            the caller must provide that much room.  No terminator is
 *            written.
 *
 * NOTE(review): the return type, braces and every return statement are
 * missing from this listing (its embedded line numbers skip values).
 * _zip_cp437_to_utf8 adds the return value to its output offset, so the
 * function presumably returns the number of bytes written (1 to 4) --
 * confirm against the complete file.
 */
192 _zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf)
/* 1 byte: the value is stored as is */
194 if (codepoint < 0x0080) {
195 buf[0] = codepoint & 0xff;
/* 2 bytes: 110xxxxx 10xxxxxx, carrying 5 + 6 payload bits */
198 if (codepoint < 0x0800) {
199 buf[0] = (zip_uint8_t)(UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f));
200 buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx, carrying 4 + 6 + 6 payload bits */
203 if (codepoint < 0x10000) {
204 buf[0] = (zip_uint8_t)(UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f));
205 buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
206 buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, carrying 3 + 6 + 6 + 6 payload bits */
209 buf[0] = (zip_uint8_t)(UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07));
210 buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f));
211 buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
212 buf[3] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
/*
 * _zip_cp437_to_utf8 -- convert LEN bytes of CP437 text to a newly
 * malloc()ed, NUL-terminated UTF-8 buffer.
 *
 * _cp437buf: input bytes; only read by the visible code.
 * len:       number of input bytes.
 * utf8_lenp: receives the length of the output, excluding the terminating
 *            NUL (stored as buflen-1).
 * error:     set to ZIP_ER_MEMORY when the allocation fails.
 *
 * The conversion makes two passes over the input: the first adds up the
 * encoded size of every byte via the _cp437_to_unicode table, the second
 * writes the encoded bytes.
 *
 * NOTE(review): the return type, braces, the initial values of buflen and
 * offset, the second argument of the encoding call, any handling of len == 0
 * or a NULL utf8_lenp, and all return statements are missing from this
 * listing (its embedded line numbers skip values).  Presumably buflen starts
 * at 1 to leave room for the NUL, and the function returns utf8buf (to be
 * released by the caller with free()) or NULL on failure -- confirm against
 * the complete file.
 */
218 _zip_cp437_to_utf8(const zip_uint8_t * const _cp437buf, zip_uint32_t len,
219 zip_uint32_t *utf8_lenp, zip_error_t *error)
/* the cast drops the const qualifier; the visible code only reads through cp437buf */
221 zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf;
222 zip_uint8_t *utf8buf;
223 zip_uint32_t buflen, i, offset;
/* pass 1: add up the UTF-8 size of every input byte.
 * NOTE(review): table entries are 16-bit, so each input byte can need up to
 * 3 output bytes; if zip_uint32_t is 32 bits wide, buflen could wrap for very
 * large len -- confirm whether callers bound len. */
232 for (i=0; i<len; i++)
233 buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]);
/* allocate the output buffer; report an out-of-memory failure through error */
235 if ((utf8buf=(zip_uint8_t*)malloc(buflen)) == NULL) {
236 zip_error_set(error, ZIP_ER_MEMORY, 0);
/* pass 2: encode each mapped code point, advancing offset by the helper's return value */
241 for (i=0; i<len; i++)
242 offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]],
/* terminate the string in the last allocated byte */
245 utf8buf[buflen-1] = 0;
/* report the length without the terminator */
247 *utf8_lenp = buflen-1;