3 * oFono - Open Source Telephony
5 * Copyright (C) 2008-2011 Intel Corporation. All rights reserved.
6 * Copyright (C) 2009-2010 Nokia Corporation and/or its subsidiary(-ies).
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
37 Name: GSM 03.38 to Unicode
40 Table format: Format A
46 Copyright (c) 2000 Unicode, Inc. All Rights reserved.
48 This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
49 No claims are made as to fitness for any particular purpose. No
50 warranties of any kind are expressed or implied. The recipient
51 agrees to determine applicability of information provided. If this
52 file has been provided on optical media by Unicode, Inc., the sole
53 remedy for any claim will be exchange of defective media within 90
56 Unicode, Inc. hereby grants the right to freely use the information
57 supplied in this file in the creation of products supporting the
58 Unicode Standard, and to make copies of this file in any form for
59 internal or external distribution as long as this notice remains
/*
 * Number of bytes needed to store code point c in UTF-8.  Only one- to
 * three-byte encodings are distinguished, which covers every value below
 * 0x10000.  The argument may be evaluated more than once, so pass an
 * expression without side effects.
 */
#define UTF8_LENGTH(c) \
	((c) >= 0x800 ? 3 : ((c) >= 0x80 ? 2 : 1))
/*
 * Number of elements in the array t.  The argument must be a real array,
 * not a pointer.  The element size is taken from the array itself, so the
 * result is correct for any element type rather than only for arrays of
 * struct codepoint.
 */
#define TABLE_SIZE(t) \
	(sizeof((t)) / sizeof((t)[0]))
/*
 * The set of lookup tables used by one conversion, filled in by
 * conversion_table_init().  Members ending in _u are searched with a
 * Unicode value and yield a GSM code; members ending in _g are looked up
 * with a GSM code and yield a Unicode value.
 */
76 struct conversion_table {
77 /* Unicode to GSM locking shift table, searched by Unicode value */
78 const struct codepoint *locking_u;
79 unsigned int locking_len_u;
81 /* Unicode to GSM single shift table, searched by Unicode value */
82 const struct codepoint *single_u;
83 unsigned int single_len_u;
85 /* GSM to Unicode locking shift table, fixed size, indexed by GSM code */
86 const unsigned short *locking_g;
88 /* GSM to Unicode single shift table, searched by GSM code */
89 const struct codepoint *single_g;
90 unsigned int single_len_g;
93 /* GSM to Unicode extension table, for GSM sequences starting with 0x1B */
/*
 * Installed as the single shift GSM-to-Unicode table (single_g) for
 * GSM_DIALECT_DEFAULT.  It is searched through codepoint_lookup(), which
 * uses bsearch(), so entries have to stay in ascending order of the GSM
 * code.
 */
94 static const struct codepoint def_ext_gsm[] = {
95 { 0x0A, 0x000C }, /* See NOTE 3 in 23.038 */
97 { 0x1B, 0x0020 }, /* See NOTE 1 in 23.038 */
/*
 * Installed as the single shift Unicode-to-GSM table (single_u) for
 * GSM_DIALECT_DEFAULT.  Its entries are not visible in this view.
 */
108 static const struct codepoint def_ext_unicode[] = {
121 /* Appendix A.2.1. in 3GPP TS23.038, V.8.2.0 */
/*
 * Installed as the single shift GSM-to-Unicode table (single_g) for
 * GSM_DIALECT_TURKISH; searched with bsearch() via codepoint_lookup().
 */
122 static const struct codepoint tur_ext_gsm[] = {
123 { 0x0A, 0x000C }, /* See NOTE 3 */
125 { 0x1B, 0x0020 }, /* See NOTE 1 */
/*
 * Installed as the single shift Unicode-to-GSM table (single_u) for
 * GSM_DIALECT_TURKISH.  Its entries are not visible in this view.
 */
143 static const struct codepoint tur_ext_unicode[] = {
163 /* Appendix A.2.2. in 3GPP TS23.038 V.8.2.0*/
/*
 * Installed as the single shift GSM-to-Unicode table (single_g) for
 * GSM_DIALECT_SPANISH; searched with bsearch() via codepoint_lookup().
 */
164 static const struct codepoint spa_ext_gsm[] = {
166 { 0x0A, 0x000C }, /* See NOTE 3 */
168 { 0x1B, 0x0020 }, /* See NOTE 1 */
/*
 * Installed as the single shift Unicode-to-GSM table (single_u) for
 * GSM_DIALECT_SPANISH.  Its entries are not visible in this view.
 */
187 static const struct codepoint spa_ext_unicode[] = {
209 /* Appendix A.2.3. in 3GPP TS23.038 V.8.2.0 */
/*
 * Installed as the single shift GSM-to-Unicode table (single_g) for
 * GSM_DIALECT_PORTUGUESE; searched with bsearch() via codepoint_lookup().
 */
210 static const struct codepoint por_ext_gsm[] = {
213 { 0x0A, 0x000C }, /* See NOTE 3 */
226 { 0x1B, 0x0020 }, /* See NOTE 1 */
/*
 * Installed as the single shift Unicode-to-GSM table (single_u) for
 * GSM_DIALECT_PORTUGUESE.  Its entries are not visible in this view.
 */
251 static const struct codepoint por_ext_unicode[] = {
291 /* Used for conversion of GSM to Unicode */
/*
 * Default alphabet: 128 Unicode values indexed directly by GSM code (each
 * row below holds eight consecutive codes).  Installed as locking_g for
 * GSM_DIALECT_DEFAULT and GSM_DIALECT_SPANISH.
 */
292 static const unsigned short def_gsm[] = {
293 0x0040, 0x00A3, 0x0024, 0x00A5, 0x00E8, 0x00E9, 0x00F9, 0x00EC,
294 0x00F2, 0x00C7, 0x000A, 0x00D8, 0x00F8, 0x000D, 0x00C5, 0x00E5,
295 0x0394, 0x005F, 0x03A6, 0x0393, 0x039B, 0x03A9, 0x03A0, 0x03A8,
296 0x03A3, 0x0398, 0x039E, 0x00A0, 0x00C6, 0x00E6, 0x00DF, 0x00C9,
297 0x0020, 0x0021, 0x0022, 0x0023, 0x00A4, 0x0025, 0x0026, 0x0027,
298 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
299 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
300 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
301 0x00A1, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
302 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
303 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
304 0x0058, 0x0059, 0x005A, 0x00C4, 0x00D6, 0x00D1, 0x00DC, 0x00A7,
305 0x00BF, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
306 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
307 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
308 0x0078, 0x0079, 0x007A, 0x00E4, 0x00F6, 0x00F1, 0x00FC, 0x00E0
311 /* Used for conversion of Unicode to GSM */
/*
 * { Unicode value, GSM code } pairs for the default alphabet, kept in
 * ascending Unicode order because the table is searched with bsearch()
 * via codepoint_lookup().  Installed as locking_u for GSM_DIALECT_DEFAULT
 * and GSM_DIALECT_SPANISH.
 */
312 static const struct codepoint def_unicode[] = {
313 { 0x000A, 0x0A }, { 0x000D, 0x0D }, { 0x0020, 0x20 }, { 0x0021, 0x21 },
314 { 0x0022, 0x22 }, { 0x0023, 0x23 }, { 0x0024, 0x02 }, { 0x0025, 0x25 },
315 { 0x0026, 0x26 }, { 0x0027, 0x27 }, { 0x0028, 0x28 }, { 0x0029, 0x29 },
316 { 0x002A, 0x2A }, { 0x002B, 0x2B }, { 0x002C, 0x2C }, { 0x002D, 0x2D },
317 { 0x002E, 0x2E }, { 0x002F, 0x2F }, { 0x0030, 0x30 }, { 0x0031, 0x31 },
318 { 0x0032, 0x32 }, { 0x0033, 0x33 }, { 0x0034, 0x34 }, { 0x0035, 0x35 },
319 { 0x0036, 0x36 }, { 0x0037, 0x37 }, { 0x0038, 0x38 }, { 0x0039, 0x39 },
320 { 0x003A, 0x3A }, { 0x003B, 0x3B }, { 0x003C, 0x3C }, { 0x003D, 0x3D },
321 { 0x003E, 0x3E }, { 0x003F, 0x3F }, { 0x0040, 0x00 }, { 0x0041, 0x41 },
322 { 0x0042, 0x42 }, { 0x0043, 0x43 }, { 0x0044, 0x44 }, { 0x0045, 0x45 },
323 { 0x0046, 0x46 }, { 0x0047, 0x47 }, { 0x0048, 0x48 }, { 0x0049, 0x49 },
324 { 0x004A, 0x4A }, { 0x004B, 0x4B }, { 0x004C, 0x4C }, { 0x004D, 0x4D },
325 { 0x004E, 0x4E }, { 0x004F, 0x4F }, { 0x0050, 0x50 }, { 0x0051, 0x51 },
326 { 0x0052, 0x52 }, { 0x0053, 0x53 }, { 0x0054, 0x54 }, { 0x0055, 0x55 },
327 { 0x0056, 0x56 }, { 0x0057, 0x57 }, { 0x0058, 0x58 }, { 0x0059, 0x59 },
328 { 0x005A, 0x5A }, { 0x005F, 0x11 }, { 0x0061, 0x61 }, { 0x0062, 0x62 },
329 { 0x0063, 0x63 }, { 0x0064, 0x64 }, { 0x0065, 0x65 }, { 0x0066, 0x66 },
330 { 0x0067, 0x67 }, { 0x0068, 0x68 }, { 0x0069, 0x69 }, { 0x006A, 0x6A },
331 { 0x006B, 0x6B }, { 0x006C, 0x6C }, { 0x006D, 0x6D }, { 0x006E, 0x6E },
332 { 0x006F, 0x6F }, { 0x0070, 0x70 }, { 0x0071, 0x71 }, { 0x0072, 0x72 },
333 { 0x0073, 0x73 }, { 0x0074, 0x74 }, { 0x0075, 0x75 }, { 0x0076, 0x76 },
334 { 0x0077, 0x77 }, { 0x0078, 0x78 }, { 0x0079, 0x79 }, { 0x007A, 0x7A },
335 { 0x00A0, 0x20 }, { 0x00A1, 0x40 }, { 0x00A3, 0x01 }, { 0x00A4, 0x24 },
336 { 0x00A5, 0x03 }, { 0x00A7, 0x5F }, { 0x00BF, 0x60 }, { 0x00C4, 0x5B },
337 { 0x00C5, 0x0E }, { 0x00C6, 0x1C }, { 0x00C7, 0x09 }, { 0x00C9, 0x1F },
338 { 0x00D1, 0x5D }, { 0x00D6, 0x5C }, { 0x00D8, 0x0B }, { 0x00DC, 0x5E },
339 { 0x00DF, 0x1E }, { 0x00E0, 0x7F }, { 0x00E4, 0x7B }, { 0x00E5, 0x0F },
340 { 0x00E6, 0x1D }, { 0x00E8, 0x04 }, { 0x00E9, 0x05 }, { 0x00EC, 0x07 },
341 { 0x00F1, 0x7D }, { 0x00F2, 0x08 }, { 0x00F6, 0x7C }, { 0x00F8, 0x0C },
342 { 0x00F9, 0x06 }, { 0x00FC, 0x7E }, { 0x0393, 0x13 }, { 0x0394, 0x10 },
343 { 0x0398, 0x19 }, { 0x039B, 0x14 }, { 0x039E, 0x1A }, { 0x03A0, 0x16 },
344 { 0x03A3, 0x18 }, { 0x03A6, 0x12 }, { 0x03A8, 0x17 }, { 0x03A9, 0x15 }
347 /* Appendix A.3.1 in 3GPP TS23.038 */
/*
 * GSM-to-Unicode locking shift table for GSM_DIALECT_TURKISH: 128 Unicode
 * values indexed directly by GSM code.  Installed as locking_g by
 * populate_locking_shift().
 */
348 static const unsigned short tur_gsm[] = {
349 0x0040, 0x00A3, 0x0024, 0x00A5, 0x20AC, 0x00E9, 0x00F9, 0x0131,
350 0x00F2, 0x00C7, 0x000A, 0x011E, 0x011F, 0x000D, 0x00C5, 0x00E5,
351 0x0394, 0x005F, 0x03A6, 0x0393, 0x039B, 0x03A9, 0x03A0, 0x03A8,
352 0x03A3, 0x0398, 0x039E, 0x00A0, 0x015E, 0x015F, 0x00DF, 0x00C9,
353 0x0020, 0x0021, 0x0022, 0x0023, 0x00A4, 0x0025, 0x0026, 0x0027,
354 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
355 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
356 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
357 0x0130, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
358 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
359 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
360 0x0058, 0x0059, 0x005A, 0x00C4, 0x00D6, 0x00D1, 0x00DC, 0x00A7,
361 0x00E7, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
362 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
363 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
364 0x0078, 0x0079, 0x007A, 0x00E4, 0x00F6, 0x00F1, 0x00FC, 0x00E0
/*
 * { Unicode value, GSM code } pairs matching tur_gsm, kept in ascending
 * Unicode order because the table is searched with bsearch() via
 * codepoint_lookup().  Installed as locking_u for GSM_DIALECT_TURKISH.
 */
367 static const struct codepoint tur_unicode[] = {
368 { 0x000A, 0x0A }, { 0x000D, 0x0D }, { 0x0020, 0x20 }, { 0x0021, 0x21 },
369 { 0x0022, 0x22 }, { 0x0023, 0x23 }, { 0x0024, 0x02 }, { 0x0025, 0x25 },
370 { 0x0026, 0x26 }, { 0x0027, 0x27 }, { 0x0028, 0x28 }, { 0x0029, 0x29 },
371 { 0x002A, 0x2A }, { 0x002B, 0x2B }, { 0x002C, 0x2C }, { 0x002D, 0x2D },
372 { 0x002E, 0x2E }, { 0x002F, 0x2F }, { 0x0030, 0x30 }, { 0x0031, 0x31 },
373 { 0x0032, 0x32 }, { 0x0033, 0x33 }, { 0x0034, 0x34 }, { 0x0035, 0x35 },
374 { 0x0036, 0x36 }, { 0x0037, 0x37 }, { 0x0038, 0x38 }, { 0x0039, 0x39 },
375 { 0x003A, 0x3A }, { 0x003B, 0x3B }, { 0x003C, 0x3C }, { 0x003D, 0x3D },
376 { 0x003E, 0x3E }, { 0x003F, 0x3F }, { 0x0040, 0x00 }, { 0x0041, 0x41 },
377 { 0x0042, 0x42 }, { 0x0043, 0x43 }, { 0x0044, 0x44 }, { 0x0045, 0x45 },
378 { 0x0046, 0x46 }, { 0x0047, 0x47 }, { 0x0048, 0x48 }, { 0x0049, 0x49 },
379 { 0x004A, 0x4A }, { 0x004B, 0x4B }, { 0x004C, 0x4C }, { 0x004D, 0x4D },
380 { 0x004E, 0x4E }, { 0x004F, 0x4F }, { 0x0050, 0x50 }, { 0x0051, 0x51 },
381 { 0x0052, 0x52 }, { 0x0053, 0x53 }, { 0x0054, 0x54 }, { 0x0055, 0x55 },
382 { 0x0056, 0x56 }, { 0x0057, 0x57 }, { 0x0058, 0x58 }, { 0x0059, 0x59 },
383 { 0x005A, 0x5A }, { 0x005F, 0x11 }, { 0x0061, 0x61 }, { 0x0062, 0x62 },
384 { 0x0063, 0x63 }, { 0x0064, 0x64 }, { 0x0065, 0x65 }, { 0x0066, 0x66 },
385 { 0x0067, 0x67 }, { 0x0068, 0x68 }, { 0x0069, 0x69 }, { 0x006A, 0x6A },
386 { 0x006B, 0x6B }, { 0x006C, 0x6C }, { 0x006D, 0x6D }, { 0x006E, 0x6E },
387 { 0x006F, 0x6F }, { 0x0070, 0x70 }, { 0x0071, 0x71 }, { 0x0072, 0x72 },
388 { 0x0073, 0x73 }, { 0x0074, 0x74 }, { 0x0075, 0x75 }, { 0x0076, 0x76 },
389 { 0x0077, 0x77 }, { 0x0078, 0x78 }, { 0x0079, 0x79 }, { 0x007A, 0x7A },
390 { 0x00A0, 0x20 }, { 0x00A3, 0x01 }, { 0x00A4, 0x24 }, { 0x00A5, 0x03 },
391 { 0x00A7, 0x5F }, { 0x00C4, 0x5B }, { 0x00C5, 0x0E }, { 0x00C7, 0x09 },
392 { 0x00C9, 0x1F }, { 0x00D1, 0x5D }, { 0x00D6, 0x5C }, { 0x00DC, 0x5E },
393 { 0x00DF, 0x1E }, { 0x00E0, 0x7F }, { 0x00E4, 0x7B }, { 0x00E5, 0x0F },
394 { 0x00E7, 0x60 }, { 0x00E9, 0x05 }, { 0x00F1, 0x7D }, { 0x00F2, 0x08 },
395 { 0x00F6, 0x7C }, { 0x00F9, 0x06 }, { 0x00FC, 0x7E }, { 0x011E, 0x0B },
396 { 0x011F, 0x0C }, { 0x0130, 0x40 }, { 0x0131, 0x07 }, { 0x015E, 0x1C },
397 { 0x015F, 0x1D }, { 0x0393, 0x13 }, { 0x0394, 0x10 }, { 0x0398, 0x19 },
398 { 0x039B, 0x14 }, { 0x039E, 0x1A }, { 0x03A0, 0x16 }, { 0x03A3, 0x18 },
399 { 0x03A6, 0x12 }, { 0x03A8, 0x17 }, { 0x03A9, 0x15 }, { 0x20AC, 0x04 }
402 /* Appendix A.3.2 in 3GPP TS23.038 */
/*
 * GSM-to-Unicode locking shift table for GSM_DIALECT_PORTUGUESE: 128
 * Unicode values indexed directly by GSM code.  Installed as locking_g by
 * populate_locking_shift().
 */
403 static const unsigned short por_gsm[] = {
404 0x0040, 0x00A3, 0x0024, 0x00A5, 0x00EA, 0x00E9, 0x00FA, 0x00ED,
405 0x00F3, 0x00E7, 0x000A, 0x00D4, 0x00F4, 0x000D, 0x00C1, 0x00E1,
406 0x0394, 0x005F, 0x00AA, 0x00C7, 0x00C0, 0x221E, 0x005E, 0x005C,
407 0x20ac, 0x00D3, 0x007C, 0x00A0, 0x00C2, 0x00E2, 0x00CA, 0x00C9,
408 0x0020, 0x0021, 0x0022, 0x0023, 0x00BA, 0x0025, 0x0026, 0x0027,
409 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
410 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
411 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
412 0x00CD, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
413 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
414 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
415 0x0058, 0x0059, 0x005A, 0x00C3, 0x00D5, 0x00DA, 0x00DC, 0x00A7,
416 0x007E, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
417 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
418 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
419 0x0078, 0x0079, 0x007A, 0x00E3, 0x00F5, 0x0060, 0x00FC, 0x00E0
/*
 * { Unicode value, GSM code } pairs matching por_gsm, kept in ascending
 * Unicode order because the table is searched with bsearch() via
 * codepoint_lookup().  Installed as locking_u for GSM_DIALECT_PORTUGUESE.
 */
422 static const struct codepoint por_unicode[] = {
423 { 0x000A, 0x0A }, { 0x000D, 0x0D }, { 0x0020, 0x20 }, { 0x0021, 0x21 },
424 { 0x0022, 0x22 }, { 0x0023, 0x23 }, { 0x0024, 0x02 }, { 0x0025, 0x25 },
425 { 0x0026, 0x26 }, { 0x0027, 0x27 }, { 0x0028, 0x28 }, { 0x0029, 0x29 },
426 { 0x002A, 0x2A }, { 0x002B, 0x2B }, { 0x002C, 0x2C }, { 0x002D, 0x2D },
427 { 0x002E, 0x2E }, { 0x002F, 0x2F }, { 0x0030, 0x30 }, { 0x0031, 0x31 },
428 { 0x0032, 0x32 }, { 0x0033, 0x33 }, { 0x0034, 0x34 }, { 0x0035, 0x35 },
429 { 0x0036, 0x36 }, { 0x0037, 0x37 }, { 0x0038, 0x38 }, { 0x0039, 0x39 },
430 { 0x003A, 0x3A }, { 0x003B, 0x3B }, { 0x003C, 0x3C }, { 0x003D, 0x3D },
431 { 0x003E, 0x3E }, { 0x003F, 0x3F }, { 0x0040, 0x00 }, { 0x0041, 0x41 },
432 { 0x0042, 0x42 }, { 0x0043, 0x43 }, { 0x0044, 0x44 }, { 0x0045, 0x45 },
433 { 0x0046, 0x46 }, { 0x0047, 0x47 }, { 0x0048, 0x48 }, { 0x0049, 0x49 },
434 { 0x004A, 0x4A }, { 0x004B, 0x4B }, { 0x004C, 0x4C }, { 0x004D, 0x4D },
435 { 0x004E, 0x4E }, { 0x004F, 0x4F }, { 0x0050, 0x50 }, { 0x0051, 0x51 },
436 { 0x0052, 0x52 }, { 0x0053, 0x53 }, { 0x0054, 0x54 }, { 0x0055, 0x55 },
437 { 0x0056, 0x56 }, { 0x0057, 0x57 }, { 0x0058, 0x58 }, { 0x0059, 0x59 },
438 { 0x005A, 0x5A }, { 0x005C, 0x17 }, { 0x005E, 0x16 }, { 0x005F, 0x11 },
439 { 0x0060, 0x7D }, { 0x0061, 0x61 }, { 0x0062, 0x62 }, { 0x0063, 0x63 },
440 { 0x0064, 0x64 }, { 0x0065, 0x65 }, { 0x0066, 0x66 }, { 0x0067, 0x67 },
441 { 0x0068, 0x68 }, { 0x0069, 0x69 }, { 0x006A, 0x6A }, { 0x006B, 0x6B },
442 { 0x006C, 0x6C }, { 0x006D, 0x6D }, { 0x006E, 0x6E }, { 0x006F, 0x6F },
443 { 0x0070, 0x70 }, { 0x0071, 0x71 }, { 0x0072, 0x72 }, { 0x0073, 0x73 },
444 { 0x0074, 0x74 }, { 0x0075, 0x75 }, { 0x0076, 0x76 }, { 0x0077, 0x77 },
445 { 0x0078, 0x78 }, { 0x0079, 0x79 }, { 0x007A, 0x7A }, { 0x007C, 0x1A },
446 { 0x007E, 0x60 }, { 0x00A0, 0x20 }, { 0x00A3, 0x01 }, { 0x00A5, 0x03 },
447 { 0x00A7, 0x5F }, { 0x00AA, 0x12 }, { 0x00BA, 0x24 }, { 0x00C0, 0x14 },
448 { 0x00C1, 0x0E }, { 0x00C2, 0x1C }, { 0x00C3, 0x5B }, { 0x00C7, 0x13 },
449 { 0x00C9, 0x1F }, { 0x00CA, 0x1E }, { 0x00CD, 0x40 }, { 0x00D3, 0x19 },
450 { 0x00D4, 0x0B }, { 0x00D5, 0x5C }, { 0x00DA, 0x5D }, { 0x00DC, 0x5E },
451 { 0x00E0, 0x7F }, { 0x00E1, 0x0F }, { 0x00E2, 0x1D }, { 0x00E3, 0x7B },
452 { 0x00E7, 0x09 }, { 0x00E9, 0x05 }, { 0x00EA, 0x04 }, { 0x00ED, 0x07 },
453 { 0x00F3, 0x08 }, { 0x00F4, 0x0C }, { 0x00F5, 0x7C }, { 0x00FA, 0x06 },
454 { 0x00FC, 0x7E }, { 0x0394, 0x10 }, { 0x20AC, 0x18 }, { 0x221E, 0x15 }
/*
 * Three-way comparison of two struct codepoint entries by their 'from'
 * field.  The signature matches the comparison callback bsearch() expects.
 * The result is built from two comparisons rather than a subtraction, so
 * it is always -1, 0 or 1.
 */
457 static int compare_codepoints(const void *a, const void *b)
459 const struct codepoint *ca = (const struct codepoint *) a;
460 const struct codepoint *cb = (const struct codepoint *) b;
462 return (ca->from > cb->from) - (ca->from < cb->from);
/*
 * Binary-searches 'table' for an entry matching 'key' and returns that
 * entry's 'to' value, or GUND when bsearch() finds no match.  Since
 * bsearch() is used, the table must be sorted consistently with the
 * comparison function.
 */
465 static unsigned short codepoint_lookup(struct codepoint *key,
466 const struct codepoint *table,
469 struct codepoint *result = NULL;
471 result = bsearch(key, table, len, sizeof(struct codepoint),
474 return result ? result->to : GUND;
/*
 * GSM to Unicode through the locking shift table: a plain array index with
 * no bounds check in this function.  NOTE(review): the locking shift tables
 * in this file hold 128 entries, so callers are presumably expected to
 * reject codes above 0x7f before calling - confirm.
 */
477 static unsigned short gsm_locking_shift_lookup(struct conversion_table *t,
480 return t->locking_g[k];
/*
 * GSM to Unicode through the single shift (extension) table of 't'.
 * Returns whatever codepoint_lookup() yields, i.e. GUND when k has no
 * entry in that table.
 */
483 static unsigned short gsm_single_shift_lookup(struct conversion_table *t,
/* Only the first member is used as the search key; the second is a filler */
486 struct codepoint key = { k, 0 };
487 return codepoint_lookup(&key, t->single_g, t->single_len_g);
/*
 * Unicode to GSM through the locking shift table of 't'.  Returns whatever
 * codepoint_lookup() yields, i.e. GUND when k has no entry in that table.
 */
490 static unsigned short unicode_locking_shift_lookup(struct conversion_table *t,
493 struct codepoint key = { k, 0 };
494 return codepoint_lookup(&key, t->locking_u, t->locking_len_u);
/*
 * Unicode to GSM through the single shift (extension) table of 't'.
 * Returns whatever codepoint_lookup() yields, i.e. GUND when k has no
 * entry in that table.
 */
497 static unsigned short unicode_single_shift_lookup(struct conversion_table *t,
500 struct codepoint key = { k, 0 };
501 return codepoint_lookup(&key, t->single_u, t->single_len_u);
/*
 * Points the locking shift members of 't' (locking_g, locking_u and
 * locking_len_u) at the tables for dialect 'lang'.  The return statements
 * are not visible in this view; presumably TRUE for a handled dialect and
 * FALSE otherwise - confirm.
 */
504 static gboolean populate_locking_shift(struct conversion_table *t,
505 enum gsm_dialect lang)
/* Spanish has no locking shift table of its own and shares the default one */
508 case GSM_DIALECT_DEFAULT:
509 case GSM_DIALECT_SPANISH:
510 t->locking_g = def_gsm;
511 t->locking_u = def_unicode;
512 t->locking_len_u = TABLE_SIZE(def_unicode);
515 case GSM_DIALECT_TURKISH:
516 t->locking_g = tur_gsm;
517 t->locking_u = tur_unicode;
518 t->locking_len_u = TABLE_SIZE(tur_unicode);
521 case GSM_DIALECT_PORTUGUESE:
522 t->locking_g = por_gsm;
523 t->locking_u = por_unicode;
524 t->locking_len_u = TABLE_SIZE(por_unicode);
/*
 * Points the single shift members of 't' (single_g, single_u and their
 * lengths) at the extension tables for dialect 'lang'.  Each of the four
 * dialects has its own pair of tables here.  The return statements are not
 * visible in this view; presumably TRUE for a handled dialect and FALSE
 * otherwise - confirm.
 */
531 static gboolean populate_single_shift(struct conversion_table *t,
532 enum gsm_dialect lang)
535 case GSM_DIALECT_DEFAULT:
536 t->single_g = def_ext_gsm;
537 t->single_len_g = TABLE_SIZE(def_ext_gsm);
538 t->single_u = def_ext_unicode;
539 t->single_len_u = TABLE_SIZE(def_ext_unicode);
542 case GSM_DIALECT_TURKISH:
543 t->single_g = tur_ext_gsm;
544 t->single_len_g = TABLE_SIZE(tur_ext_gsm);
545 t->single_u = tur_ext_unicode;
546 t->single_len_u = TABLE_SIZE(tur_ext_unicode);
549 case GSM_DIALECT_SPANISH:
550 t->single_g = spa_ext_gsm;
551 t->single_len_g = TABLE_SIZE(spa_ext_gsm);
552 t->single_u = spa_ext_unicode;
553 t->single_len_u = TABLE_SIZE(spa_ext_unicode);
556 case GSM_DIALECT_PORTUGUESE:
557 t->single_g = por_ext_gsm;
558 t->single_len_g = TABLE_SIZE(por_ext_gsm);
559 t->single_u = por_ext_unicode;
560 t->single_len_u = TABLE_SIZE(por_ext_unicode);
/*
 * Prepares 't' for a conversion: clears it, then installs the locking
 * shift tables for dialect 'locking' and the single shift tables for
 * dialect 'single'.  The result is the logical AND of both populate
 * calls; because of short-circuit evaluation the single shift tables are
 * only installed when the locking shift step succeeded.
 */
567 static gboolean conversion_table_init(struct conversion_table *t,
568 enum gsm_dialect locking,
569 enum gsm_dialect single)
571 memset(t, 0, sizeof(struct conversion_table));
573 return populate_locking_shift(t, locking) &&
574 populate_single_shift(t, single);
578 * Converts text coded using GSM codec into UTF8 encoded text, using
579 * the given language identifiers for single shift and locking shift
580 * tables. If len is less than 0, and terminator character is given,
581 * the length is computed automatically.
583 * Returns newly-allocated UTF8 encoded string or NULL if the conversion
584 * could not be performed. Returns the number of bytes read from the
585 * GSM encoded string in items_read (if not NULL), not including the
586 * terminator character. Returns the number of bytes written into the UTF8
587 * encoded string in items_written (if not NULL) not including the terminal
588 * '\0' character. The caller is responsible for freeing the returned value.
590 char *convert_gsm_to_utf8_with_lang(const unsigned char *text, long len,
591 long *items_read, long *items_written,
592 unsigned char terminator,
593 enum gsm_dialect locking_lang,
594 enum gsm_dialect single_lang)
601 struct conversion_table t;
/* Build the lookup tables for the requested dialects before anything else */
603 if (conversion_table_init(&t, locking_lang, single_lang) == FALSE)
/* A negative length can only be resolved when a terminator is given */
606 if (len < 0 && !terminator)
/* Find the length by scanning for the terminator */
612 while (text[i] != terminator)
/* First pass: add up the UTF-8 size of the result in res_length */
618 for (i = 0, res_length = 0; i < len; i++) {
/* 0x1b escapes to the single shift table for the code that follows */
624 if (text[i] == 0x1b) {
629 c = gsm_single_shift_lookup(&t, text[i]);
632 * According to the comment in the table from
633 * 3GPP 23.038, Section 6.2.1.1:
634 * "In the event that an MS receives a code where
635 * a symbol is not represented in the above table
636 * then the MS shall display either the character
637 * shown in the main GSM 7 bit default alphabet
638 * table in subclause 6.2.1., or the character from
639 * the National Language Locking Shift Table in the
640 * case where the locking shift mechanism as defined
641 * in subclause 6.2.1.2.3 is used."
644 c = gsm_locking_shift_lookup(&t, text[i]);
646 c = gsm_locking_shift_lookup(&t, text[i]);
648 res_length += UTF8_LENGTH(c);
/* One extra byte beyond the counted output */
651 res = g_try_malloc(res_length + 1);
/* Second pass: emit UTF-8 until the size computed above has been written */
658 while (out < res + res_length) {
661 if (text[i] == 0x1b) {
662 c = gsm_single_shift_lookup(&t, text[++i]);
665 c = gsm_locking_shift_lookup(&t, text[i]);
667 c = gsm_locking_shift_lookup(&t, text[i]);
669 out += g_unichar_to_utf8(c, out);
677 *items_written = out - res;
/*
 * Convenience wrapper around convert_gsm_to_utf8_with_lang().  The dialect
 * argument visible here is GSM_DIALECT_DEFAULT; part of the argument list
 * is outside this view.
 */
686 char *convert_gsm_to_utf8(const unsigned char *text, long len,
687 long *items_read, long *items_written,
688 unsigned char terminator)
690 return convert_gsm_to_utf8_with_lang(text, len, items_read,
694 GSM_DIALECT_DEFAULT);
698 * Converts UTF-8 encoded text to GSM alphabet. The result is unpacked,
699 * with the 7th bit always 0. If terminator is not 0, a terminator character
700 * is appended to the result. This should be in the range 0x80-0xf0
702 * Returns the encoded data or NULL if the data could not be encoded. The
703 * data must be freed by the caller. If items_read is not NULL, it contains
704 * the actual number of bytes read. If items_written is not NULL, contains
705 * the number of bytes written.
707 unsigned char *convert_utf8_to_gsm_with_lang(const char *text, long len,
708 long *items_read, long *items_written,
709 unsigned char terminator,
710 enum gsm_dialect locking_lang,
711 enum gsm_dialect single_lang)
713 struct conversion_table t;
717 unsigned char *res = NULL;
/* Build the lookup tables for the requested dialects before anything else */
721 if (conversion_table_init(&t, locking_lang, single_lang) == FALSE)
/*
 * First pass over the input: stops at a NUL byte or, when len is not
 * negative, after len bytes.  Each character is validated and looked up.
 */
727 while ((len < 0 || text + len - in > 0) && *in) {
/* With no explicit length, let the validator look at up to 6 bytes */
728 long max = len < 0 ? 6 : text + len - in;
729 gunichar c = g_utf8_get_char_validated(in, max);
730 unsigned short converted = GUND;
/* Try the locking shift table first, then fall back to single shift */
738 converted = unicode_locking_shift_lookup(&t, c);
740 if (converted == GUND)
741 converted = unicode_single_shift_lookup(&t, c);
743 if (converted == GUND)
/* Presumably flags a code that needs the 0x1b escape prefix - confirm */
746 if (converted & 0x1b00)
751 in = g_utf8_next_char(in);
/* Room for one extra byte when a terminator is requested */
755 res = g_try_malloc(res_len + (terminator ? 1 : 0));
/* Second pass: the input was validated above, so decode without checks */
761 for (i = 0; i < nchars; i++) {
762 unsigned short converted;
764 gunichar c = g_utf8_get_char(in);
766 converted = unicode_locking_shift_lookup(&t, c);
768 if (converted == GUND)
769 converted = unicode_single_shift_lookup(&t, c);
771 if (converted & 0x1b00) {
779 in = g_utf8_next_char(in);
786 *items_written = out - res;
790 *items_read = in - text;
/*
 * Convenience wrapper around convert_utf8_to_gsm_with_lang().  The dialect
 * argument visible here is GSM_DIALECT_DEFAULT; part of the argument list
 * is outside this view.
 */
795 unsigned char *convert_utf8_to_gsm(const char *text, long len,
796 long *items_read, long *items_written,
797 unsigned char terminator)
799 return convert_utf8_to_gsm_with_lang(text, len, items_read,
803 GSM_DIALECT_DEFAULT);
807 * Converts UTF-8 encoded text to GSM alphabet. It finds an encoding
808 * that uses the minimum set of GSM dialects based on the hint given.
810 * It first attempts to use the default dialect's single shift and
811 * locking shift tables. It then tries with only the single shift
812 * table of the hinted dialect, and finally with both the single shift
813 * and locking shift tables of the hinted dialect.
815 * Returns the encoded data or NULL if no suitable encoding could be
816 * found. The data must be freed by the caller. If items_read is not
817 * NULL, it contains the actual number of bytes read. If items_written
818 * is not NULL, it contains the number of bytes written. If
819 * used_locking and used_single are not NULL, they will contain the
820 * dialects used for the locking shift and single shift tables.
822 unsigned char *convert_utf8_to_gsm_best_lang(const char *utf8, long len,
823 long *items_read, long *items_written,
824 unsigned char terminator,
825 enum gsm_dialect hint,
826 enum gsm_dialect *used_locking,
827 enum gsm_dialect *used_single)
/* Both dialects start out as the default and track what was last tried */
829 enum gsm_dialect locking = GSM_DIALECT_DEFAULT;
830 enum gsm_dialect single = GSM_DIALECT_DEFAULT;
831 unsigned char *encoded;
/* First attempt */
833 encoded = convert_utf8_to_gsm_with_lang(utf8, len, items_read,
834 items_written, terminator,
/* A default hint offers no further tables to try */
839 if (hint == GSM_DIALECT_DEFAULT)
/* Second attempt */
843 encoded = convert_utf8_to_gsm_with_lang(utf8, len, items_read,
844 items_written, terminator,
849 /* Spanish dialect uses the default locking shift table */
850 if (hint == GSM_DIALECT_SPANISH)
/* Third attempt */
854 encoded = convert_utf8_to_gsm_with_lang(utf8, len, items_read,
855 items_written, terminator,
/* Report the dialects to the caller, when asked for */
862 if (used_locking != NULL)
863 *used_locking = locking;
865 if (used_single != NULL)
866 *used_single = single;
872 * Decodes the hex encoded data and converts to a byte array. If terminator
873 * is not 0, the terminator character is appended to the end of the result.
874 * This might be useful for converting GSM encoded data if the CSCS is set
877 * Please note that since GSM does allow embedded null characters, use
878 * of the terminator or items_written is encouraged to find the real size
881 unsigned char *decode_hex_own_buf(const char *in, long len, long *items_written,
882 unsigned char terminator,
/* i walks the hex input while j counts the bytes produced */
894 for (i = 0, j = 0; i < len; i++, j++) {
/*
 * First hex digit of the pair.  Only '0'-'9' and upper-case 'A'-'F'
 * branches are visible here.
 */
897 if (c >= '0' && c <= '9')
899 else if (c >= 'A' && c <= 'F')
/* Second hex digit: shift the first digit up and add this one */
908 if (c >= '0' && c <= '9')
909 b = b * 16 + c - '0';
910 else if (c >= 'A' && c <= 'F')
911 b = b * 16 + 10 + c - 'A';
/*
 * Allocating variant of decode_hex_own_buf(): inspects the input
 * characters, allocates the output buffer with g_new() and delegates the
 * decoding.
 */
927 unsigned char *decode_hex(const char *in, long len, long *items_written,
928 unsigned char terminator)
/* Pre-scan of the input; what happens to other characters is not visible */
939 for (i = 0; i < len; i++) {
942 if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))
/* Two hex digits per output byte, plus one byte for an optional terminator */
948 buf = g_new(unsigned char, (len >> 1) + (terminator ? 1 : 0));
950 return decode_hex_own_buf(in, len, items_written, terminator, buf);
954 * Encodes the data using hexadecimal characters. len can be negative,
955 * in that case the terminator is used to find the last character. This is
956 * useful for handling GSM-encoded strings which allow ASCII NULL character
959 char *encode_hex_own_buf(const unsigned char *in, long len,
960 unsigned char terminator, char *buf)
/* Scan for the terminator to find the length */
968 while (in[i] != terminator)
/* i walks the input bytes while j walks the output characters */
974 for (i = 0, j = 0; i < len; i++, j++) {
/* High nibble first */
975 c = (in[i] >> 4) & 0xf;
/* Values 10..15 become upper-case 'A'..'F' */
980 buf[j] = 'A' + c - 10;
989 buf[j] = 'A' + c - 10;
/*
 * Allocating variant of encode_hex_own_buf(): determines the input length,
 * allocates the output string with g_new() and delegates the encoding.
 */
997 char *encode_hex(const unsigned char *in, long len, unsigned char terminator)
/* Scan for the terminator to find the length */
1005 while (in[i] != terminator)
/* Two hex characters per input byte plus one more byte */
1011 buf = g_new(char, len * 2 + 1);
1013 return encode_hex_own_buf(in, len, terminator, buf);
/*
 * Unpacks 7-bit packed data from 'in' (len octets) into the caller-supplied
 * 'buf', one value per output byte, writing at most max_to_unpack values.
 * The number of bytes produced is stored through items_written.
 */
1016 unsigned char *unpack_7bit_own_buf(const unsigned char *in, long len,
1017 int byte_offset, gboolean ussd,
1018 long max_to_unpack, long *items_written,
1019 unsigned char terminator,
/* Bits carried over from the previous octet */
1022 unsigned char rest = 0;
1023 unsigned char *out = buf;
/* How many low bits of the current octet are used; 7 at offset 0 */
1024 int bits = 7 - (byte_offset % 7);
1030 /* In the case of CB, unpack as much as possible */
1032 max_to_unpack = len * 8 / 7;
1034 for (i = 0; (i < len) && ((out-buf) < max_to_unpack); i++) {
1035 /* Grab what we have in the current octet */
1036 *out = (in[i] & ((1 << bits) - 1)) << (7 - bits);
1038 /* Append what we have from the previous octet, if any */
1041 /* Figure out the remainder */
1042 rest = (in[i] >> bits) & ((1 << (8-bits)) - 1);
1045 * We have the entire character, here we don't increate
1046 * out if this is we started at an offset. Instead
1047 * we effectively populate variable rest
1049 if (i != 0 || bits == 7)
1052 if ((out-buf) == max_to_unpack)
1056 * We expected only 1 bit from this octet, means there's 7
1057 * left, take care of them here
1070 * According to 23.038 6.1.2.3.1, last paragraph:
1071 * "If the total number of characters to be sent equals (8n-1)
1072 * where n=1,2,3 etc. then there are 7 spare bits at the end
1073 * of the message. To avoid the situation where the receiving
1074 * entity confuses 7 binary zero pad bits as the @ character,
1075 * the carriage return or <CR> character shall be used for
1076 * padding in this situation, just as for Cell Broadcast."
1078 * "The receiving entity shall remove the final <CR> character where
1079 * the message ends on an octet boundary with <CR> as the last
1082 if (ussd && (((out - buf) % 8) == 0) && (*(out - 1) == '\r'))
1089 *items_written = out - buf;
/*
 * Allocating variant of unpack_7bit_own_buf(): the buffer from g_new()
 * holds len * 8 / 7 bytes plus one more when a terminator is given, and all
 * arguments are passed through unchanged.
 */
1094 unsigned char *unpack_7bit(const unsigned char *in, long len, int byte_offset,
1095 gboolean ussd, long max_to_unpack,
1096 long *items_written, unsigned char terminator)
1098 unsigned char *buf = g_new(unsigned char,
1099 len * 8 / 7 + (terminator ? 1 : 0));
1101 return unpack_7bit_own_buf(in, len, byte_offset, ussd, max_to_unpack,
1102 items_written, terminator, buf);
/*
 * Packs the values in 'in' (one per byte) into 7-bit packed form in the
 * caller-supplied 'buf'.  The number of bytes produced is stored through
 * items_written.
 */
1105 unsigned char *pack_7bit_own_buf(const unsigned char *in, long len,
1106 int byte_offset, gboolean ussd,
1107 long *items_written,
1108 unsigned char terminator,
/* Starting bit position derived from byte_offset; 7 at offset 0 */
1111 int bits = 7 - (byte_offset % 7);
1112 unsigned char *out = buf;
/* Scan for the terminator to find the length */
1122 while (in[i] != terminator)
/* Each input value contributes 7 bits */
1128 total_bits = len * 7;
1136 for (i = 0; i < len; i++) {
1138 *out |= (in[i] & ((1 << (7 - bits)) - 1)) <<
1143 /* This is a no op when bits == 0, lets keep valgrind happy */
1145 *out = in[i] >> (7 - bits);
1154 * If <CR> is intended to be the last character and the message
1155 * (including the wanted <CR>) ends on an octet boundary, then
1156 * another <CR> must be added together with a padding bit 0. The
1157 * receiving entity will perform the carriage return function twice,
1158 * but this will not result in misoperation as the definition of
1159 * <CR> in clause 6.1.1 is identical to the definition of <CR><CR>.
1161 if (ussd && ((total_bits % 8) == 1))
1167 if (ussd && ((total_bits % 8) == 0) && (in[len - 1] == '\r')) {
1173 *items_written = out - buf;
/*
 * Allocating variant of pack_7bit_own_buf(): works out the packed size,
 * allocates the buffer with g_new() and delegates the packing.
 */
1178 unsigned char *pack_7bit(const unsigned char *in, long len, int byte_offset,
1179 gboolean ussd, long *items_written,
1180 unsigned char terminator)
1182 int bits = 7 - (byte_offset % 7);
/* Both a non-empty input and an items_written pointer are required */
1187 if (len == 0 || items_written == NULL)
/* Scan for the terminator to find the length */
1193 while (in[i] != terminator)
1199 total_bits = len * 7;
1204 /* Round up number of bytes, must append <cr> if true */
1205 if (ussd && ((total_bits % 8) == 0) && (in[len - 1] == '\r'))
/* One octet more than the rounded-up size, for the extra <CR> */
1206 buf = g_new(unsigned char, (total_bits + 14) / 8);
1208 buf = g_new(unsigned char, (total_bits + 7) / 8);
1210 return pack_7bit_own_buf(in, len, byte_offset, ussd, items_written,
/*
 * Converts a SIM-stored string of 'length' bytes to UTF-8.  The first byte
 * selects the coding: a value below 0x80 means GSM text padded with 0xff;
 * otherwise the switch on buffer[0] handles the UCS-2 based forms and the
 * empty string (0xff).  The default GSM dialect tables are always used.
 */
1214 char *sim_string_to_utf8(const unsigned char *buffer, int length)
1216 struct conversion_table t;
/* Base added to the low 7 bits of bytes that have the top bit set */
1220 unsigned short ucs2_offset;
1226 if (conversion_table_init(&t, GSM_DIALECT_DEFAULT,
1227 GSM_DIALECT_DEFAULT) == FALSE)
1233 if (buffer[0] < 0x80) {
1235 * We have to find the real length, since on SIM file system
1236 * alpha fields are 0xff padded
1238 for (i = 0; i < length; i++)
1239 if (buffer[i] == 0xff)
1242 return convert_gsm_to_utf8(buffer, i, NULL, NULL, 0);
1245 switch (buffer[0]) {
/* Plain UCS-2 payload: an odd trailing byte is only tolerated as 0xff */
1247 if (((length - 1) % 2) == 1) {
1248 if (buffer[length - 1] != 0xff)
1251 length = length - 1;
/* Look for a 0xff 0xff pair on a 16-bit boundary after the first byte */
1254 for (i = 1; i < length; i += 2)
1255 if (buffer[i] == 0xff && buffer[i + 1] == 0xff)
1258 return g_convert((char *) buffer + 1, i - 1,
1259 "UTF-8//TRANSLIT", "UCS-2BE",
/* Form with a one-byte base: buffer[1] is the count, buffer[2] the base */
1262 if (length < 3 || (buffer[1] > (length - 3)))
1265 num_chars = buffer[1];
1266 ucs2_offset = buffer[2] << 7;
/* Form with a two-byte base held in buffer[2] and buffer[3] */
1271 if (length < 4 || buffer[1] > length - 4)
1274 num_chars = buffer[1];
1275 ucs2_offset = (buffer[2] << 8) | buffer[3];
1279 case 0xff: /* Special case of empty string */
/* First pass: add up the UTF-8 size in res_len */
1293 while ((i < length) && (j < num_chars)) {
/* Top bit set: low 7 bits are relative to ucs2_offset */
1296 if (buffer[i] & 0x80) {
1297 c = (buffer[i++] & 0x7f) + ucs2_offset;
/* Surrogate range */
1299 if (c >= 0xd800 && c < 0xe000)
1302 res_len += UTF8_LENGTH(c);
/* Top bit clear: a GSM code, with 0x1b escaping to the single shift table */
1307 if (buffer[i] == 0x1b) {
1312 c = gsm_single_shift_lookup(&t, buffer[i++]);
1319 c = gsm_locking_shift_lookup(&t, buffer[i++]);
1323 res_len += UTF8_LENGTH(c);
1329 /* Check that the string is padded out to the length by 0xff */
1330 for (; i < length; i++)
1331 if (buffer[i] != 0xff)
1334 utf8 = g_try_malloc(res_len + 1);
/* Second pass: emit UTF-8 until the size computed above has been written */
1341 while (out < utf8 + res_len) {
1344 if (buffer[i] & 0x80)
1345 c = (buffer[i++] & 0x7f) + ucs2_offset;
1346 else if (buffer[i] == 0x1b) {
1348 c = gsm_single_shift_lookup(&t, buffer[i++]);
1350 c = gsm_locking_shift_lookup(&t, buffer[i++]);
1352 out += g_unichar_to_utf8(c, out);
/*
 * Encodes a UTF-8 string for storage on the SIM.  GSM encoding is tried
 * first via convert_utf8_to_gsm(); the code further down converts to
 * UCS-2BE and copies it after one leading byte of the result.  The
 * resulting size is stored through out_length.
 */
1360 unsigned char *utf8_to_sim_string(const char *utf, int max_length,
1363 unsigned char *result;
1364 unsigned char *ucs2;
1368 result = convert_utf8_to_gsm(utf, -1, NULL, &gsm_bytes, 0);
/*
 * Truncate to max_length.
 * NOTE(review): the UCS-2 path below treats max_length == -1 as "no
 * limit", but no such check is visible on this comparison - confirm.
 */
1370 if (gsm_bytes > max_length) {
1371 gsm_bytes = max_length;
/* Do not let the truncated string end in a dangling 0x1b escape */
1372 while (gsm_bytes && result[gsm_bytes - 1] == 0x1b)
1376 *out_length = gsm_bytes;
1380 /* NOTE: UCS2 formats with an offset are never used */
1382 ucs2 = (guint8 *) g_convert(utf, -1, "UCS-2BE//TRANSLIT", "UTF-8",
1383 NULL, &converted, NULL);
/* One byte goes to the leading byte; keep the payload an even byte count */
1387 if (max_length != -1 && (int) converted + 1 > max_length)
1388 converted = (max_length - 1) & ~1;
1390 result = g_try_malloc(converted + 1);
1391 if (result == NULL) {
1396 *out_length = converted + 1;
/* The UCS-2 payload follows the first byte of the result */
1399 memcpy(&result[1], ucs2, converted);
1406 * Converts UCS2 encoded text to GSM alphabet. The result is unpacked,
1407 * with the 7th bit always 0. If terminator is not 0, a terminator character
1408 * is appended to the result.
1410 * Returns the encoded data or NULL if the data could not be encoded. The
1411 * data must be freed by the caller. If items_read is not NULL, it contains
1412 * the actual number of bytes read. If items_written is not NULL, contains
1413 * the number of bytes written.
1415 unsigned char *convert_ucs2_to_gsm_with_lang(const unsigned char *text,
1416 long len, long *items_read,
1417 long *items_written,
1418 unsigned char terminator,
1419 enum gsm_dialect locking_lang,
1420 enum gsm_dialect single_lang)
1422 struct conversion_table t;
1424 const unsigned char *in;
1426 unsigned char *res = NULL;
/* Build the lookup tables for the requested dialects before anything else */
1430 if (conversion_table_init(&t, locking_lang, single_lang) == FALSE)
/* The input must be a positive, even number of bytes */
1433 if (len < 1 || len % 2)
/* First pass: look every character up before any output is allocated */
1439 for (i = 0; i < len; i += 2) {
/* Two bytes per character, most significant byte first */
1440 gunichar c = (in[i] << 8) | in[i + 1];
1441 unsigned short converted = GUND;
/* Try the locking shift table first, then fall back to single shift */
1446 converted = unicode_locking_shift_lookup(&t, c);
1448 if (converted == GUND)
1449 converted = unicode_single_shift_lookup(&t, c);
1451 if (converted == GUND)
/* Presumably flags a code that needs the 0x1b escape prefix - confirm */
1454 if (converted & 0x1b00)
/* Room for one extra byte when a terminator is requested */
1462 res = g_try_malloc(res_len + (terminator ? 1 : 0));
/* Second pass: repeat the lookups and write the output */
1469 for (i = 0; i < len; i += 2) {
1470 gunichar c = (in[i] << 8) | in[i + 1];
1471 unsigned short converted = GUND;
1473 converted = unicode_locking_shift_lookup(&t, c);
1475 if (converted == GUND)
1476 converted = unicode_single_shift_lookup(&t, c);
1478 if (converted & 0x1b00) {
1491 *items_written = out - res;
/*
 * Convenience wrapper around convert_ucs2_to_gsm_with_lang() that uses
 * GSM_DIALECT_DEFAULT for both the locking shift and single shift tables.
 */
1500 unsigned char *convert_ucs2_to_gsm(const unsigned char *text, long len,
1501 long *items_read, long *items_written,
1502 unsigned char terminator)
1504 return convert_ucs2_to_gsm_with_lang(text, len, items_read,
1507 GSM_DIALECT_DEFAULT,
1508 GSM_DIALECT_DEFAULT);