3 * oFono - Open Source Telephony
5 * Copyright (C) 2008-2011 Intel Corporation. All rights reserved.
6 * Copyright (C) 2009-2010 Nokia Corporation and/or its subsidiary(-ies).
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
37 Name: GSM 03.38 to Unicode
40 Table format: Format A
46 Copyright (c) 2000 Unicode, Inc. All Rights reserved.
48 This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
49 No claims are made as to fitness for any particular purpose. No
50 warranties of any kind are expressed or implied. The recipient
51 agrees to determine applicability of information provided. If this
52 file has been provided on optical media by Unicode, Inc., the sole
53 remedy for any claim will be exchange of defective media within 90
56 Unicode, Inc. hereby grants the right to freely use the information
57 supplied in this file in the creation of products supporting the
58 Unicode Standard, and to make copies of this file in any form for
59 internal or external distribution as long as this notice remains
/*
 * Number of bytes needed to store code point c in UTF-8: one byte below
 * 0x80, two bytes below 0x800 and three bytes for everything else.  The
 * argument is evaluated more than once, so it must be free of side effects.
 */
#define UTF8_LENGTH(c) \
	((c) < 0x800 ? ((c) < 0x80 ? 1 : 2) : 3)
/*
 * Number of elements in the array t.  The element size is taken from the
 * array itself, so the count is correct for any element type, not only
 * struct codepoint.  t must be a real array (not a pointer).
 */
#define TABLE_SIZE(t) \
	(sizeof((t)) / sizeof((t)[0]))
/*
 * Lookup tables selected for one conversion.  Members ending in _u are
 * searched with a Unicode code point as key and yield GSM codes; members
 * ending in _g are looked up with a GSM code and yield Unicode code points.
 */
76 struct conversion_table {
77 /* Unicode to GSM locking shift table, searched by Unicode code point */
78 const struct codepoint *locking_u;
79 unsigned int locking_len_u;
81 /* Unicode to GSM single shift table, searched by Unicode code point */
82 const struct codepoint *single_u;
83 unsigned int single_len_u;
85 /* GSM to Unicode locking shift table, indexed by GSM code, fixed size */
86 const unsigned short *locking_g;
88 /* GSM to Unicode single shift table, searched by GSM code */
89 const struct codepoint *single_g;
90 unsigned int single_len_g;
93 /* GSM to Unicode extension table, for GSM sequences starting with 0x1B */
/*
 * Single shift table of the default dialect.  In each entry the first
 * value is the GSM code used as the lookup key and the second value is
 * the Unicode code point returned for it.
 */
94 static const struct codepoint def_ext_gsm[] = {
95 { 0x0A, 0x000C }, /* See NOTE 3 in 23.038 */
97 { 0x1B, 0x0020 }, /* See NOTE 1 in 23.038 */
/*
 * Unicode to GSM single shift table of the default dialect; searched with
 * the Unicode code point as key.
 */
108 static const struct codepoint def_ext_unicode[] = {
121 /* Appendix A.2.1. in 3GPP TS23.038, V.8.2.0 */
/*
 * GSM to Unicode single shift table of the Turkish dialect; searched with
 * the GSM code as key.
 */
122 static const struct codepoint tur_ext_gsm[] = {
123 { 0x0A, 0x000C }, /* See NOTE 3 */
125 { 0x1B, 0x0020 }, /* See NOTE 1 */
/*
 * Unicode to GSM single shift table of the Turkish dialect; searched with
 * the Unicode code point as key.
 */
143 static const struct codepoint tur_ext_unicode[] = {
163 /* Appendix A.2.2. in 3GPP TS23.038 V.8.2.0*/
/*
 * GSM to Unicode single shift table of the Spanish dialect; searched with
 * the GSM code as key.
 */
164 static const struct codepoint spa_ext_gsm[] = {
166 { 0x0A, 0x000C }, /* See NOTE 3 */
168 { 0x1B, 0x0020 }, /* See NOTE 1 */
/*
 * Unicode to GSM single shift table of the Spanish dialect; searched with
 * the Unicode code point as key.
 */
187 static const struct codepoint spa_ext_unicode[] = {
209 /* Appendix A.2.3. in 3GPP TS23.038 V.8.2.0 */
/*
 * GSM to Unicode single shift table of the Portuguese dialect; searched
 * with the GSM code as key.
 */
210 static const struct codepoint por_ext_gsm[] = {
213 { 0x0A, 0x000C }, /* See NOTE 3 */
226 { 0x1B, 0x0020 }, /* See NOTE 1 */
/*
 * Unicode to GSM single shift table of the Portuguese dialect; searched
 * with the Unicode code point as key.
 */
251 static const struct codepoint por_ext_unicode[] = {
291 /* Used for conversion of GSM to Unicode */
/*
 * Default alphabet locking shift table: 128 entries indexed directly by
 * the GSM code, each holding the corresponding Unicode code point.
 */
292 static const unsigned short def_gsm[] = {
293 0x0040, 0x00A3, 0x0024, 0x00A5, 0x00E8, 0x00E9, 0x00F9, 0x00EC,
294 0x00F2, 0x00C7, 0x000A, 0x00D8, 0x00F8, 0x000D, 0x00C5, 0x00E5,
295 0x0394, 0x005F, 0x03A6, 0x0393, 0x039B, 0x03A9, 0x03A0, 0x03A8,
296 0x03A3, 0x0398, 0x039E, 0x00A0, 0x00C6, 0x00E6, 0x00DF, 0x00C9,
297 0x0020, 0x0021, 0x0022, 0x0023, 0x00A4, 0x0025, 0x0026, 0x0027,
298 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
299 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
300 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
301 0x00A1, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
302 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
303 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
304 0x0058, 0x0059, 0x005A, 0x00C4, 0x00D6, 0x00D1, 0x00DC, 0x00A7,
305 0x00BF, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
306 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
307 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
308 0x0078, 0x0079, 0x007A, 0x00E4, 0x00F6, 0x00F1, 0x00FC, 0x00E0
311 /* Used for conversion of Unicode to GSM */
/*
 * Default alphabet, Unicode to GSM direction.  Each entry is
 * { Unicode code point, GSM code }.  Entries are kept in ascending order
 * of the Unicode code point because the table is searched with bsearch();
 * new entries must preserve that order.
 */
312 static const struct codepoint def_unicode[] = {
313 { 0x000A, 0x0A }, { 0x000D, 0x0D }, { 0x0020, 0x20 }, { 0x0021, 0x21 },
314 { 0x0022, 0x22 }, { 0x0023, 0x23 }, { 0x0024, 0x02 }, { 0x0025, 0x25 },
315 { 0x0026, 0x26 }, { 0x0027, 0x27 }, { 0x0028, 0x28 }, { 0x0029, 0x29 },
316 { 0x002A, 0x2A }, { 0x002B, 0x2B }, { 0x002C, 0x2C }, { 0x002D, 0x2D },
317 { 0x002E, 0x2E }, { 0x002F, 0x2F }, { 0x0030, 0x30 }, { 0x0031, 0x31 },
318 { 0x0032, 0x32 }, { 0x0033, 0x33 }, { 0x0034, 0x34 }, { 0x0035, 0x35 },
319 { 0x0036, 0x36 }, { 0x0037, 0x37 }, { 0x0038, 0x38 }, { 0x0039, 0x39 },
320 { 0x003A, 0x3A }, { 0x003B, 0x3B }, { 0x003C, 0x3C }, { 0x003D, 0x3D },
321 { 0x003E, 0x3E }, { 0x003F, 0x3F }, { 0x0040, 0x00 }, { 0x0041, 0x41 },
322 { 0x0042, 0x42 }, { 0x0043, 0x43 }, { 0x0044, 0x44 }, { 0x0045, 0x45 },
323 { 0x0046, 0x46 }, { 0x0047, 0x47 }, { 0x0048, 0x48 }, { 0x0049, 0x49 },
324 { 0x004A, 0x4A }, { 0x004B, 0x4B }, { 0x004C, 0x4C }, { 0x004D, 0x4D },
325 { 0x004E, 0x4E }, { 0x004F, 0x4F }, { 0x0050, 0x50 }, { 0x0051, 0x51 },
326 { 0x0052, 0x52 }, { 0x0053, 0x53 }, { 0x0054, 0x54 }, { 0x0055, 0x55 },
327 { 0x0056, 0x56 }, { 0x0057, 0x57 }, { 0x0058, 0x58 }, { 0x0059, 0x59 },
328 { 0x005A, 0x5A }, { 0x005F, 0x11 }, { 0x0061, 0x61 }, { 0x0062, 0x62 },
329 { 0x0063, 0x63 }, { 0x0064, 0x64 }, { 0x0065, 0x65 }, { 0x0066, 0x66 },
330 { 0x0067, 0x67 }, { 0x0068, 0x68 }, { 0x0069, 0x69 }, { 0x006A, 0x6A },
331 { 0x006B, 0x6B }, { 0x006C, 0x6C }, { 0x006D, 0x6D }, { 0x006E, 0x6E },
332 { 0x006F, 0x6F }, { 0x0070, 0x70 }, { 0x0071, 0x71 }, { 0x0072, 0x72 },
333 { 0x0073, 0x73 }, { 0x0074, 0x74 }, { 0x0075, 0x75 }, { 0x0076, 0x76 },
334 { 0x0077, 0x77 }, { 0x0078, 0x78 }, { 0x0079, 0x79 }, { 0x007A, 0x7A },
335 { 0x00A0, 0x20 }, { 0x00A1, 0x40 }, { 0x00A3, 0x01 }, { 0x00A4, 0x24 },
336 { 0x00A5, 0x03 }, { 0x00A7, 0x5F }, { 0x00BF, 0x60 }, { 0x00C4, 0x5B },
337 { 0x00C5, 0x0E }, { 0x00C6, 0x1C }, { 0x00C7, 0x09 }, { 0x00C9, 0x1F },
338 { 0x00D1, 0x5D }, { 0x00D6, 0x5C }, { 0x00D8, 0x0B }, { 0x00DC, 0x5E },
339 { 0x00DF, 0x1E }, { 0x00E0, 0x7F }, { 0x00E4, 0x7B }, { 0x00E5, 0x0F },
340 { 0x00E6, 0x1D }, { 0x00E8, 0x04 }, { 0x00E9, 0x05 }, { 0x00EC, 0x07 },
341 { 0x00F1, 0x7D }, { 0x00F2, 0x08 }, { 0x00F6, 0x7C }, { 0x00F8, 0x0C },
342 { 0x00F9, 0x06 }, { 0x00FC, 0x7E }, { 0x0393, 0x13 }, { 0x0394, 0x10 },
343 { 0x0398, 0x19 }, { 0x039B, 0x14 }, { 0x039E, 0x1A }, { 0x03A0, 0x16 },
344 { 0x03A3, 0x18 }, { 0x03A6, 0x12 }, { 0x03A8, 0x17 }, { 0x03A9, 0x15 }
347 /* Appendix A.3.1 in 3GPP TS23.038 */
/*
 * Turkish locking shift table, GSM to Unicode direction: 128 entries
 * indexed directly by the GSM code.
 */
348 static const unsigned short tur_gsm[] = {
349 0x0040, 0x00A3, 0x0024, 0x00A5, 0x20AC, 0x00E9, 0x00F9, 0x0131,
350 0x00F2, 0x00C7, 0x000A, 0x011E, 0x011F, 0x000D, 0x00C5, 0x00E5,
351 0x0394, 0x005F, 0x03A6, 0x0393, 0x039B, 0x03A9, 0x03A0, 0x03A8,
352 0x03A3, 0x0398, 0x039E, 0x00A0, 0x015E, 0x015F, 0x00DF, 0x00C9,
353 0x0020, 0x0021, 0x0022, 0x0023, 0x00A4, 0x0025, 0x0026, 0x0027,
354 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
355 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
356 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
357 0x0130, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
358 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
359 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
360 0x0058, 0x0059, 0x005A, 0x00C4, 0x00D6, 0x00D1, 0x00DC, 0x00A7,
361 0x00E7, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
362 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
363 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
364 0x0078, 0x0079, 0x007A, 0x00E4, 0x00F6, 0x00F1, 0x00FC, 0x00E0
/*
 * Turkish locking shift table, Unicode to GSM direction.  Each entry is
 * { Unicode code point, GSM code }.  Entries are kept in ascending order
 * of the Unicode code point because the table is searched with bsearch();
 * new entries must preserve that order.
 */
367 static const struct codepoint tur_unicode[] = {
368 { 0x000A, 0x0A }, { 0x000D, 0x0D }, { 0x0020, 0x20 }, { 0x0021, 0x21 },
369 { 0x0022, 0x22 }, { 0x0023, 0x23 }, { 0x0024, 0x02 }, { 0x0025, 0x25 },
370 { 0x0026, 0x26 }, { 0x0027, 0x27 }, { 0x0028, 0x28 }, { 0x0029, 0x29 },
371 { 0x002A, 0x2A }, { 0x002B, 0x2B }, { 0x002C, 0x2C }, { 0x002D, 0x2D },
372 { 0x002E, 0x2E }, { 0x002F, 0x2F }, { 0x0030, 0x30 }, { 0x0031, 0x31 },
373 { 0x0032, 0x32 }, { 0x0033, 0x33 }, { 0x0034, 0x34 }, { 0x0035, 0x35 },
374 { 0x0036, 0x36 }, { 0x0037, 0x37 }, { 0x0038, 0x38 }, { 0x0039, 0x39 },
375 { 0x003A, 0x3A }, { 0x003B, 0x3B }, { 0x003C, 0x3C }, { 0x003D, 0x3D },
376 { 0x003E, 0x3E }, { 0x003F, 0x3F }, { 0x0040, 0x00 }, { 0x0041, 0x41 },
377 { 0x0042, 0x42 }, { 0x0043, 0x43 }, { 0x0044, 0x44 }, { 0x0045, 0x45 },
378 { 0x0046, 0x46 }, { 0x0047, 0x47 }, { 0x0048, 0x48 }, { 0x0049, 0x49 },
379 { 0x004A, 0x4A }, { 0x004B, 0x4B }, { 0x004C, 0x4C }, { 0x004D, 0x4D },
380 { 0x004E, 0x4E }, { 0x004F, 0x4F }, { 0x0050, 0x50 }, { 0x0051, 0x51 },
381 { 0x0052, 0x52 }, { 0x0053, 0x53 }, { 0x0054, 0x54 }, { 0x0055, 0x55 },
382 { 0x0056, 0x56 }, { 0x0057, 0x57 }, { 0x0058, 0x58 }, { 0x0059, 0x59 },
383 { 0x005A, 0x5A }, { 0x005F, 0x11 }, { 0x0061, 0x61 }, { 0x0062, 0x62 },
384 { 0x0063, 0x63 }, { 0x0064, 0x64 }, { 0x0065, 0x65 }, { 0x0066, 0x66 },
385 { 0x0067, 0x67 }, { 0x0068, 0x68 }, { 0x0069, 0x69 }, { 0x006A, 0x6A },
386 { 0x006B, 0x6B }, { 0x006C, 0x6C }, { 0x006D, 0x6D }, { 0x006E, 0x6E },
387 { 0x006F, 0x6F }, { 0x0070, 0x70 }, { 0x0071, 0x71 }, { 0x0072, 0x72 },
388 { 0x0073, 0x73 }, { 0x0074, 0x74 }, { 0x0075, 0x75 }, { 0x0076, 0x76 },
389 { 0x0077, 0x77 }, { 0x0078, 0x78 }, { 0x0079, 0x79 }, { 0x007A, 0x7A },
390 { 0x00A0, 0x20 }, { 0x00A3, 0x01 }, { 0x00A4, 0x24 }, { 0x00A5, 0x03 },
391 { 0x00A7, 0x5F }, { 0x00C4, 0x5B }, { 0x00C5, 0x0E }, { 0x00C7, 0x09 },
392 { 0x00C9, 0x1F }, { 0x00D1, 0x5D }, { 0x00D6, 0x5C }, { 0x00DC, 0x5E },
393 { 0x00DF, 0x1E }, { 0x00E0, 0x7F }, { 0x00E4, 0x7B }, { 0x00E5, 0x0F },
394 { 0x00E7, 0x60 }, { 0x00E9, 0x05 }, { 0x00F1, 0x7D }, { 0x00F2, 0x08 },
395 { 0x00F6, 0x7C }, { 0x00F9, 0x06 }, { 0x00FC, 0x7E }, { 0x011E, 0x0B },
396 { 0x011F, 0x0C }, { 0x0130, 0x40 }, { 0x0131, 0x07 }, { 0x015E, 0x1C },
397 { 0x015F, 0x1D }, { 0x0393, 0x13 }, { 0x0394, 0x10 }, { 0x0398, 0x19 },
398 { 0x039B, 0x14 }, { 0x039E, 0x1A }, { 0x03A0, 0x16 }, { 0x03A3, 0x18 },
399 { 0x03A6, 0x12 }, { 0x03A8, 0x17 }, { 0x03A9, 0x15 }, { 0x20AC, 0x04 }
402 /* Appendix A.3.2 in 3GPP TS23.038 */
/*
 * Portuguese locking shift table, GSM to Unicode direction: 128 entries
 * indexed directly by the GSM code.
 */
403 static const unsigned short por_gsm[] = {
404 0x0040, 0x00A3, 0x0024, 0x00A5, 0x00EA, 0x00E9, 0x00FA, 0x00ED,
405 0x00F3, 0x00E7, 0x000A, 0x00D4, 0x00F4, 0x000D, 0x00C1, 0x00E1,
406 0x0394, 0x005F, 0x00AA, 0x00C7, 0x00C0, 0x221E, 0x005E, 0x005C,
407 0x20ac, 0x00D3, 0x007C, 0x00A0, 0x00C2, 0x00E2, 0x00CA, 0x00C9,
408 0x0020, 0x0021, 0x0022, 0x0023, 0x00BA, 0x0025, 0x0026, 0x0027,
409 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
410 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
411 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
412 0x00CD, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
413 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
414 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
415 0x0058, 0x0059, 0x005A, 0x00C3, 0x00D5, 0x00DA, 0x00DC, 0x00A7,
416 0x007E, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
417 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
418 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
419 0x0078, 0x0079, 0x007A, 0x00E3, 0x00F5, 0x0060, 0x00FC, 0x00E0
/*
 * Portuguese locking shift table, Unicode to GSM direction.  Each entry
 * is { Unicode code point, GSM code }.  Entries are kept in ascending
 * order of the Unicode code point because the table is searched with
 * bsearch(); new entries must preserve that order.
 */
422 static const struct codepoint por_unicode[] = {
423 { 0x000A, 0x0A }, { 0x000D, 0x0D }, { 0x0020, 0x20 }, { 0x0021, 0x21 },
424 { 0x0022, 0x22 }, { 0x0023, 0x23 }, { 0x0024, 0x02 }, { 0x0025, 0x25 },
425 { 0x0026, 0x26 }, { 0x0027, 0x27 }, { 0x0028, 0x28 }, { 0x0029, 0x29 },
426 { 0x002A, 0x2A }, { 0x002B, 0x2B }, { 0x002C, 0x2C }, { 0x002D, 0x2D },
427 { 0x002E, 0x2E }, { 0x002F, 0x2F }, { 0x0030, 0x30 }, { 0x0031, 0x31 },
428 { 0x0032, 0x32 }, { 0x0033, 0x33 }, { 0x0034, 0x34 }, { 0x0035, 0x35 },
429 { 0x0036, 0x36 }, { 0x0037, 0x37 }, { 0x0038, 0x38 }, { 0x0039, 0x39 },
430 { 0x003A, 0x3A }, { 0x003B, 0x3B }, { 0x003C, 0x3C }, { 0x003D, 0x3D },
431 { 0x003E, 0x3E }, { 0x003F, 0x3F }, { 0x0040, 0x00 }, { 0x0041, 0x41 },
432 { 0x0042, 0x42 }, { 0x0043, 0x43 }, { 0x0044, 0x44 }, { 0x0045, 0x45 },
433 { 0x0046, 0x46 }, { 0x0047, 0x47 }, { 0x0048, 0x48 }, { 0x0049, 0x49 },
434 { 0x004A, 0x4A }, { 0x004B, 0x4B }, { 0x004C, 0x4C }, { 0x004D, 0x4D },
435 { 0x004E, 0x4E }, { 0x004F, 0x4F }, { 0x0050, 0x50 }, { 0x0051, 0x51 },
436 { 0x0052, 0x52 }, { 0x0053, 0x53 }, { 0x0054, 0x54 }, { 0x0055, 0x55 },
437 { 0x0056, 0x56 }, { 0x0057, 0x57 }, { 0x0058, 0x58 }, { 0x0059, 0x59 },
438 { 0x005A, 0x5A }, { 0x005C, 0x17 }, { 0x005E, 0x16 }, { 0x005F, 0x11 },
439 { 0x0060, 0x7D }, { 0x0061, 0x61 }, { 0x0062, 0x62 }, { 0x0063, 0x63 },
440 { 0x0064, 0x64 }, { 0x0065, 0x65 }, { 0x0066, 0x66 }, { 0x0067, 0x67 },
441 { 0x0068, 0x68 }, { 0x0069, 0x69 }, { 0x006A, 0x6A }, { 0x006B, 0x6B },
442 { 0x006C, 0x6C }, { 0x006D, 0x6D }, { 0x006E, 0x6E }, { 0x006F, 0x6F },
443 { 0x0070, 0x70 }, { 0x0071, 0x71 }, { 0x0072, 0x72 }, { 0x0073, 0x73 },
444 { 0x0074, 0x74 }, { 0x0075, 0x75 }, { 0x0076, 0x76 }, { 0x0077, 0x77 },
445 { 0x0078, 0x78 }, { 0x0079, 0x79 }, { 0x007A, 0x7A }, { 0x007C, 0x1A },
446 { 0x007E, 0x60 }, { 0x00A0, 0x20 }, { 0x00A3, 0x01 }, { 0x00A5, 0x03 },
447 { 0x00A7, 0x5F }, { 0x00AA, 0x12 }, { 0x00BA, 0x24 }, { 0x00C0, 0x14 },
448 { 0x00C1, 0x0E }, { 0x00C2, 0x1C }, { 0x00C3, 0x5B }, { 0x00C7, 0x13 },
449 { 0x00C9, 0x1F }, { 0x00CA, 0x1E }, { 0x00CD, 0x40 }, { 0x00D3, 0x19 },
450 { 0x00D4, 0x0B }, { 0x00D5, 0x5C }, { 0x00DA, 0x5D }, { 0x00DC, 0x5E },
451 { 0x00E0, 0x7F }, { 0x00E1, 0x0F }, { 0x00E2, 0x1D }, { 0x00E3, 0x7B },
452 { 0x00E7, 0x09 }, { 0x00E9, 0x05 }, { 0x00EA, 0x04 }, { 0x00ED, 0x07 },
453 { 0x00F3, 0x08 }, { 0x00F4, 0x0C }, { 0x00F5, 0x7C }, { 0x00FA, 0x06 },
454 { 0x00FC, 0x7E }, { 0x0394, 0x10 }, { 0x20AC, 0x18 }, { 0x221E, 0x15 }
/*
 * Three-way comparison of two struct codepoint entries by their from
 * field.  Returns -1, 0 or 1 when a sorts before, equal to or after b.
 * The signature matches the comparator expected by bsearch() and qsort().
 */
457 static int compare_codepoints(const void *a, const void *b)
459 const struct codepoint *ca = (const struct codepoint *) a;
460 const struct codepoint *cb = (const struct codepoint *) b;
/* Branch-free -1/0/1 result; avoids overflow of a plain subtraction */
462 return (ca->from > cb->from) - (ca->from < cb->from);
/*
 * Binary-searches table for the entry matching key and returns that
 * entry's to value.  Returns GUND when bsearch() finds no match.  The
 * table must be sorted in the order the comparator defines, as bsearch()
 * requires.
 */
465 static unsigned short codepoint_lookup(struct codepoint *key,
466 const struct codepoint *table,
469 struct codepoint *result = NULL;
471 result = bsearch(key, table, len, sizeof(struct codepoint),
474 return result ? result->to : GUND;
/*
 * Returns the entry of the GSM to Unicode locking shift table of t at
 * index k.  The index is used without a range check here; the tables
 * assigned to locking_g hold 128 entries, so k must be below 0x80.
 */
477 static unsigned short gsm_locking_shift_lookup(struct conversion_table *t,
480 return t->locking_g[k];
/*
 * Looks up GSM code k in the GSM to Unicode single shift table of t.
 * Returns the value stored for k, or GUND if the table has no entry.
 */
483 static unsigned short gsm_single_shift_lookup(struct conversion_table *t,
/* Only the first member of the key takes part in the comparison */
486 struct codepoint key = { k, 0 };
487 return codepoint_lookup(&key, t->single_g, t->single_len_g);
/*
 * Looks up Unicode code point k in the Unicode to GSM locking shift table
 * of t.  Returns the value stored for k, or GUND if the table has no
 * entry.
 */
490 static unsigned short unicode_locking_shift_lookup(struct conversion_table *t,
/* Only the first member of the key takes part in the comparison */
493 struct codepoint key = { k, 0 };
494 return codepoint_lookup(&key, t->locking_u, t->locking_len_u);
/*
 * Looks up Unicode code point k in the Unicode to GSM single shift table
 * of t.  Returns the value stored for k, or GUND if the table has no
 * entry.
 */
497 static unsigned short unicode_single_shift_lookup(struct conversion_table *t,
/* Only the first member of the key takes part in the comparison */
500 struct codepoint key = { k, 0 };
501 return codepoint_lookup(&key, t->single_u, t->single_len_u);
/*
 * Points the locking shift members of t (locking_g, locking_u and
 * locking_len_u) at the tables of dialect lang.
 */
504 static gboolean populate_locking_shift(struct conversion_table *t,
505 enum gsm_dialect lang)
/* Spanish shares the default locking shift tables */
508 case GSM_DIALECT_DEFAULT:
509 case GSM_DIALECT_SPANISH:
510 t->locking_g = def_gsm;
511 t->locking_u = def_unicode;
512 t->locking_len_u = TABLE_SIZE(def_unicode);
515 case GSM_DIALECT_TURKISH:
516 t->locking_g = tur_gsm;
517 t->locking_u = tur_unicode;
518 t->locking_len_u = TABLE_SIZE(tur_unicode);
521 case GSM_DIALECT_PORTUGUESE:
522 t->locking_g = por_gsm;
523 t->locking_u = por_unicode;
524 t->locking_len_u = TABLE_SIZE(por_unicode);
/*
 * Points the single shift members of t (single_g, single_u and their
 * lengths) at the extension tables of dialect lang.  Every dialect handled
 * here has its own pair of extension tables.
 */
531 static gboolean populate_single_shift(struct conversion_table *t,
532 enum gsm_dialect lang)
535 case GSM_DIALECT_DEFAULT:
536 t->single_g = def_ext_gsm;
537 t->single_len_g = TABLE_SIZE(def_ext_gsm);
538 t->single_u = def_ext_unicode;
539 t->single_len_u = TABLE_SIZE(def_ext_unicode);
542 case GSM_DIALECT_TURKISH:
543 t->single_g = tur_ext_gsm;
544 t->single_len_g = TABLE_SIZE(tur_ext_gsm);
545 t->single_u = tur_ext_unicode;
546 t->single_len_u = TABLE_SIZE(tur_ext_unicode);
549 case GSM_DIALECT_SPANISH:
550 t->single_g = spa_ext_gsm;
551 t->single_len_g = TABLE_SIZE(spa_ext_gsm);
552 t->single_u = spa_ext_unicode;
553 t->single_len_u = TABLE_SIZE(spa_ext_unicode);
556 case GSM_DIALECT_PORTUGUESE:
557 t->single_g = por_ext_gsm;
558 t->single_len_g = TABLE_SIZE(por_ext_gsm);
559 t->single_u = por_ext_unicode;
560 t->single_len_u = TABLE_SIZE(por_ext_unicode);
/*
 * Clears t and fills it with the locking shift tables of dialect locking
 * and the single shift tables of dialect single.  The result is true only
 * if both populate calls report success.
 */
567 static gboolean conversion_table_init(struct conversion_table *t,
568 enum gsm_dialect locking,
569 enum gsm_dialect single)
571 memset(t, 0, sizeof(struct conversion_table));
/* && short-circuits: single shift is not populated if locking fails */
573 return populate_locking_shift(t, locking) &&
574 populate_single_shift(t, single);
578 * Converts text coded using GSM codec into UTF8 encoded text, using
579 * the given language identifiers for single shift and locking shift
580 * tables. If len is less than 0, and terminator character is given,
581 * the length is computed automatically.
583 * Returns newly-allocated UTF8 encoded string or NULL if the conversion
584 * could not be performed. Returns the number of bytes read from the
585 * GSM encoded string in items_read (if not NULL), not including the
586 * terminator character. Returns the number of bytes written into the UTF8
587 * encoded string in items_written (if not NULL) not including the terminal
588 * '\0' character. The caller is responsible for freeing the returned value.
590 char *convert_gsm_to_utf8_with_lang(const unsigned char *text, long len,
591 long *items_read, long *items_written,
592 unsigned char terminator,
593 enum gsm_dialect locking_lang,
594 enum gsm_dialect single_lang)
601 struct conversion_table t;
/* Select the lookup tables for the requested dialects */
603 if (conversion_table_init(&t, locking_lang, single_lang) == FALSE)
/* A negative len can only be resolved by scanning for a terminator */
606 if (len < 0 && !terminator)
/* Walk forward to the terminator to find the input length */
612 while (text[i] != terminator)
/* First pass: add up the UTF-8 size of every decoded character */
618 for (i = 0, res_length = 0; i < len; i++) {
/* 0x1b escapes to the single shift table */
624 if (text[i] == 0x1b) {
629 c = gsm_single_shift_lookup(&t, text[i]);
634 c = gsm_locking_shift_lookup(&t, text[i]);
637 res_length += UTF8_LENGTH(c);
/* One byte more than the UTF-8 length computed above */
640 res = g_try_malloc(res_length + 1);
/* Second pass: emit UTF-8 until the precomputed length is reached */
647 while (out < res + res_length) {
/* The pre-increment moves past the escape to the byte that follows it */
651 c = gsm_single_shift_lookup(&t, text[++i]);
653 c = gsm_locking_shift_lookup(&t, text[i]);
655 out += g_unichar_to_utf8(c, out);
663 *items_written = out - res;
/*
 * Convenience wrapper around convert_gsm_to_utf8_with_lang() that uses
 * the default GSM dialect.
 */
672 char *convert_gsm_to_utf8(const unsigned char *text, long len,
673 long *items_read, long *items_written,
674 unsigned char terminator)
676 return convert_gsm_to_utf8_with_lang(text, len, items_read,
680 GSM_DIALECT_DEFAULT);
684 * Converts UTF-8 encoded text to GSM alphabet. The result is unpacked, one
685 * septet per byte with the top bit (bit 7) always 0. If terminator is not 0,
686 * it is appended to the result; it should be in the range 0x80-0xf0.
688 * Returns the encoded data or NULL if the data could not be encoded. The
689 * data must be freed by the caller. If items_read is not NULL, it contains
690 * the actual number of bytes read. If items_written is not NULL, contains
691 * the number of bytes written.
693 unsigned char *convert_utf8_to_gsm_with_lang(const char *text, long len,
694 long *items_read, long *items_written,
695 unsigned char terminator,
696 enum gsm_dialect locking_lang,
697 enum gsm_dialect single_lang)
699 struct conversion_table t;
703 unsigned char *res = NULL;
/* Select the lookup tables for the requested dialects */
707 if (conversion_table_init(&t, locking_lang, single_lang) == FALSE)
/*
 * First pass: validate the UTF-8 input and check that every character
 * can be encoded.  Stops after len bytes (when len is not negative) or
 * at the first NUL byte.
 */
713 while ((len < 0 || text + len - in > 0) && *in) {
/* Without an explicit length let the validator look at up to 6 bytes */
714 long max = len < 0 ? 6 : text + len - in;
715 gunichar c = g_utf8_get_char_validated(in, max);
716 unsigned short converted = GUND;
/* Try the locking shift table first, then the single shift table */
724 converted = unicode_locking_shift_lookup(&t, c);
726 if (converted == GUND)
727 converted = unicode_single_shift_lookup(&t, c);
729 if (converted == GUND)
/*
 * NOTE(review): single shift results appear to carry the 0x1b escape in
 * their high byte -- confirm against the *_ext_unicode tables.
 */
732 if (converted & 0x1b00)
737 in = g_utf8_next_char(in);
/* Room for the encoded bytes plus the optional terminator */
741 res = g_try_malloc(res_len + (terminator ? 1 : 0));
/* Second pass: the input was validated above, so decode without checks */
747 for (i = 0; i < nchars; i++) {
748 unsigned short converted;
750 gunichar c = g_utf8_get_char(in);
752 converted = unicode_locking_shift_lookup(&t, c);
754 if (converted == GUND)
755 converted = unicode_single_shift_lookup(&t, c);
757 if (converted & 0x1b00) {
765 in = g_utf8_next_char(in);
772 *items_written = out - res;
/* Number of input bytes consumed */
776 *items_read = in - text;
/*
 * Convenience wrapper around convert_utf8_to_gsm_with_lang() that uses
 * the default GSM dialect.
 */
781 unsigned char *convert_utf8_to_gsm(const char *text, long len,
782 long *items_read, long *items_written,
783 unsigned char terminator)
785 return convert_utf8_to_gsm_with_lang(text, len, items_read,
789 GSM_DIALECT_DEFAULT);
793 * Converts UTF-8 encoded text to GSM alphabet. It finds an encoding
794 * that uses the minimum set of GSM dialects based on the hint given.
796 * It first attempts to use the default dialect's single shift and
797 * locking shift tables. It then tries with only the single shift
798 * table of the hinted dialect, and finally with both the single shift
799 * and locking shift tables of the hinted dialect.
801 * Returns the encoded data or NULL if no suitable encoding could be
802 * found. The data must be freed by the caller. If items_read is not
803 * NULL, it contains the actual number of bytes read. If items_written
804 * is not NULL, it contains the number of bytes written. If
805 * used_locking and used_single are not NULL, they will contain the
806 * dialects used for the locking shift and single shift tables.
808 unsigned char *convert_utf8_to_gsm_best_lang(const char *utf8, long len,
809 long *items_read, long *items_written,
810 unsigned char terminator,
811 enum gsm_dialect hint,
812 enum gsm_dialect *used_locking,
813 enum gsm_dialect *used_single)
/* Both dialects start out as the default one */
815 enum gsm_dialect locking = GSM_DIALECT_DEFAULT;
816 enum gsm_dialect single = GSM_DIALECT_DEFAULT;
817 unsigned char *encoded;
/* First attempt */
819 encoded = convert_utf8_to_gsm_with_lang(utf8, len, items_read,
820 items_written, terminator,
/* A default hint leaves no further dialect to try */
825 if (hint == GSM_DIALECT_DEFAULT)
/* Second attempt */
829 encoded = convert_utf8_to_gsm_with_lang(utf8, len, items_read,
830 items_written, terminator,
835 /* Spanish dialect uses the default locking shift table */
836 if (hint == GSM_DIALECT_SPANISH)
/* Third attempt */
840 encoded = convert_utf8_to_gsm_with_lang(utf8, len, items_read,
841 items_written, terminator,
/* Report the dialects that were used, if the caller asked for them */
848 if (used_locking != NULL)
849 *used_locking = locking;
851 if (used_single != NULL)
852 *used_single = single;
858 * Decodes the hex encoded data and converts to a byte array. If terminator
859 * is not 0, the terminator character is appended to the end of the result.
860 * This might be useful for converting GSM encoded data if the CSCS is set
863 * Please note that since GSM does allow embedded null characters, use
864 * of the terminator or items_written is encouraged to find the real size
867 unsigned char *decode_hex_own_buf(const char *in, long len, long *items_written,
868 unsigned char terminator,
/* i walks the input, j counts the loop iterations alongside it */
880 for (i = 0, j = 0; i < len; i++, j++) {
/* First hex digit; the checks shown accept 0-9 and upper-case A-F */
883 if (c >= '0' && c <= '9')
885 else if (c >= 'A' && c <= 'F')
/* Second hex digit: shift the value so far by one nibble and add it */
894 if (c >= '0' && c <= '9')
895 b = b * 16 + c - '0';
896 else if (c >= 'A' && c <= 'F')
897 b = b * 16 + 10 + c - 'A';
/*
 * Allocating variant of decode_hex_own_buf(): scans the input for the
 * characters 0-9 and upper-case A-F, allocates a buffer of len / 2 bytes
 * (plus one when a terminator is requested) and decodes into it.
 */
913 unsigned char *decode_hex(const char *in, long len, long *items_written,
914 unsigned char terminator)
925 for (i = 0; i < len; i++) {
928 if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))
/* Two hex digits per output byte, plus the optional terminator */
934 buf = g_new(unsigned char, (len >> 1) + (terminator ? 1 : 0));
936 return decode_hex_own_buf(in, len, items_written, terminator, buf);
940 * Encodes the data using hexadecimal characters. len can be negative,
941 * in that case the terminator is used to find the last character. This is
942 * useful for handling GSM-encoded strings which allow ASCII NULL character
945 char *encode_hex_own_buf(const unsigned char *in, long len,
946 unsigned char terminator, char *buf)
/* Walk forward to the terminator to find the input length */
954 while (in[i] != terminator)
/* i walks the input, j counts the loop iterations alongside it */
960 for (i = 0, j = 0; i < len; i++, j++) {
/* High nibble of the current byte */
961 c = (in[i] >> 4) & 0xf;
/* Values 10-15 map to the upper-case letters A-F */
966 buf[j] = 'A' + c - 10;
975 buf[j] = 'A' + c - 10;
/*
 * Allocating variant of encode_hex_own_buf(): allocates the output buffer
 * and encodes into it.
 */
983 char *encode_hex(const unsigned char *in, long len, unsigned char terminator)
/* Walk forward to the terminator to find the input length */
991 while (in[i] != terminator)
/* Two hex characters per input byte plus one extra byte */
997 buf = g_new(char, len * 2 + 1);
999 return encode_hex_own_buf(in, len, terminator, buf);
/*
 * Unpacks len octets of 7-bit packed data from in into a caller supplied
 * buffer, producing at most max_to_unpack output values.  The number of
 * values produced is stored in *items_written.
 */
1002 unsigned char *unpack_7bit_own_buf(const unsigned char *in, long len,
1003 int byte_offset, gboolean ussd,
1004 long max_to_unpack, long *items_written,
1005 unsigned char terminator,
/* Bits carried over from the previous octet */
1008 unsigned char rest = 0;
1009 unsigned char *out = buf;
/* Number of bits the first octet contributes to the first value */
1010 int bits = 7 - (byte_offset % 7);
1016 /* In the case of CB, unpack as much as possible */
1018 max_to_unpack = len * 8 / 7;
1020 for (i = 0; (i < len) && ((out-buf) < max_to_unpack); i++) {
1021 /* Grab what we have in the current octet */
1022 *out = (in[i] & ((1 << bits) - 1)) << (7 - bits);
1024 /* Append what we have from the previous octet, if any */
1027 /* Figure out the remainder */
1028 rest = (in[i] >> bits) & ((1 << (8-bits)) - 1);
1031 * We have the entire character, here we don't increate
1032 * out if this is we started at an offset. Instead
1033 * we effectively populate variable rest
1035 if (i != 0 || bits == 7)
1038 if ((out-buf) == max_to_unpack)
1042 * We expected only 1 bit from this octet, means there's 7
1043 * left, take care of them here
1056 * According to 23.038 6.1.2.3.1, last paragraph:
1057 * "If the total number of characters to be sent equals (8n-1)
1058 * where n=1,2,3 etc. then there are 7 spare bits at the end
1059 * of the message. To avoid the situation where the receiving
1060 * entity confuses 7 binary zero pad bits as the @ character,
1061 * the carriage return or <CR> character shall be used for
1062 * padding in this situation, just as for Cell Broadcast."
1064 * "The receiving entity shall remove the final <CR> character where
1065 * the message ends on an octet boundary with <CR> as the last
/* USSD only: output is a multiple of 8 values and ends with <CR> */
1068 if (ussd && (((out - buf) % 8) == 0) && (*(out - 1) == '\r'))
1075 *items_written = out - buf;
/*
 * Allocating variant of unpack_7bit_own_buf(): allocates a buffer of
 * len * 8 / 7 bytes (plus one when a terminator is requested) and unpacks
 * into it.
 */
1080 unsigned char *unpack_7bit(const unsigned char *in, long len, int byte_offset,
1081 gboolean ussd, long max_to_unpack,
1082 long *items_written, unsigned char terminator)
1084 unsigned char *buf = g_new(unsigned char,
1085 len * 8 / 7 + (terminator ? 1 : 0));
1087 return unpack_7bit_own_buf(in, len, byte_offset, ussd, max_to_unpack,
1088 items_written, terminator, buf);
/*
 * Packs len 7-bit values from in into a caller supplied buffer.  The
 * number of octets produced is stored in *items_written.
 */
1091 unsigned char *pack_7bit_own_buf(const unsigned char *in, long len,
1092 int byte_offset, gboolean ussd,
1093 long *items_written,
1094 unsigned char terminator,
/* Bit position derived from the requested starting offset */
1097 int bits = 7 - (byte_offset % 7);
1098 unsigned char *out = buf;
/* Walk forward to the terminator to find the input length */
1108 while (in[i] != terminator)
/* Seven payload bits per input value */
1114 total_bits = len * 7;
1122 for (i = 0; i < len; i++) {
1124 *out |= (in[i] & ((1 << (7 - bits)) - 1)) <<
1129 /* This is a no op when bits == 0, lets keep valgrind happy */
1131 *out = in[i] >> (7 - bits);
1140 * If <CR> is intended to be the last character and the message
1141 * (including the wanted <CR>) ends on an octet boundary, then
1142 * another <CR> must be added together with a padding bit 0. The
1143 * receiving entity will perform the carriage return function twice,
1144 * but this will not result in misoperation as the definition of
1145 * <CR> in clause 6.1.1 is identical to the definition of <CR><CR>.
/* USSD only: exactly one payload bit in the last octet, 7 bits spare */
1147 if (ussd && ((total_bits % 8) == 1))
/* USSD only: message ends on an octet boundary with <CR> as last value */
1153 if (ussd && ((total_bits % 8) == 0) && (in[len - 1] == '\r')) {
1159 *items_written = out - buf;
/*
 * Allocating variant of pack_7bit_own_buf(): works out the packed size,
 * allocates the output buffer and packs into it.
 */
1164 unsigned char *pack_7bit(const unsigned char *in, long len, int byte_offset,
1165 gboolean ussd, long *items_written,
1166 unsigned char terminator)
1168 int bits = 7 - (byte_offset % 7);
/* Empty input and a missing items_written pointer are special-cased */
1173 if (len == 0 || items_written == NULL)
/* Walk forward to the terminator to find the input length */
1179 while (in[i] != terminator)
1185 total_bits = len * 7;
1190 /* Round up number of bytes, must append <cr> if true */
1191 if (ussd && ((total_bits % 8) == 0) && (in[len - 1] == '\r'))
/* Room for one additional 7-bit value, rounded up to whole octets */
1192 buf = g_new(unsigned char, (total_bits + 14) / 8);
1194 buf = g_new(unsigned char, (total_bits + 7) / 8);
1196 return pack_7bit_own_buf(in, len, byte_offset, ussd, items_written,
/*
 * Converts a SIM alpha field of length bytes to UTF-8.  The first byte
 * selects the coding: values below 0x80 mean the whole field is GSM
 * default alphabet text, other values are dispatched by the switch below
 * to UCS-2BE text or to a mixed coding in which bytes with the top bit
 * set are offsets from a UCS-2 base and all other bytes are GSM codes.
 */
1200 char *sim_string_to_utf8(const unsigned char *buffer, int length)
1202 struct conversion_table t;
1206 unsigned short ucs2_offset;
/* SIM strings always use the default dialect tables */
1212 if (conversion_table_init(&t, GSM_DIALECT_DEFAULT,
1213 GSM_DIALECT_DEFAULT) == FALSE)
1219 if (buffer[0] < 0x80) {
1221 * We have to find the real length, since on SIM file system
1222 * alpha fields are 0xff padded
1224 for (i = 0; i < length; i++)
1225 if (buffer[i] == 0xff)
1228 return convert_gsm_to_utf8(buffer, i, NULL, NULL, 0);
1231 switch (buffer[0]) {
/* UCS-2 payload: an odd payload size is only accepted with 0xff padding */
1233 if (((length - 1) % 2) == 1) {
1234 if (buffer[length - 1] != 0xff)
1237 length = length - 1;
/* Scan 16-bit units for a 0xff 0xff pair */
1240 for (i = 1; i < length; i += 2)
1241 if (buffer[i] == 0xff && buffer[i + 1] == 0xff)
1244 return g_convert((char *) buffer + 1, i - 1,
1245 "UTF-8//TRANSLIT", "UCS-2BE",
/* Three header bytes: coding, character count, one-byte base */
1248 if (length < 3 || (buffer[1] > (length - 3)))
1251 num_chars = buffer[1];
/* The base byte supplies bits 7 and up of the UCS-2 base */
1252 ucs2_offset = buffer[2] << 7;
/* Four header bytes: coding, character count, 16-bit big-endian base */
1257 if (length < 4 || buffer[1] > length - 4)
1260 num_chars = buffer[1];
1261 ucs2_offset = (buffer[2] << 8) | buffer[3];
/* First pass: add up the UTF-8 size of every character */
1273 while ((i < length) && (j < num_chars)) {
/* Top bit set: low 7 bits are an offset from the UCS-2 base */
1276 if (buffer[i] & 0x80) {
1277 c = (buffer[i++] & 0x7f) + ucs2_offset;
/* 0xd800-0xdfff is the UTF-16 surrogate range */
1279 if (c >= 0xd800 && c < 0xe000)
1282 res_len += UTF8_LENGTH(c);
/* 0x1b escapes to the single shift table */
1287 if (buffer[i] == 0x1b) {
1292 c = gsm_single_shift_lookup(&t, buffer[i++]);
1299 c = gsm_locking_shift_lookup(&t, buffer[i++]);
1303 res_len += UTF8_LENGTH(c);
1309 /* Check that the string is padded out to the length by 0xff */
1310 for (; i < length; i++)
1311 if (buffer[i] != 0xff)
/* One byte more than the UTF-8 length computed above */
1314 utf8 = g_try_malloc(res_len + 1);
/* Second pass: emit UTF-8 until the precomputed length is reached */
1321 while (out < utf8 + res_len) {
1324 if (buffer[i] & 0x80)
1325 c = (buffer[i++] & 0x7f) + ucs2_offset;
1326 else if (buffer[i] == 0x1b) {
1328 c = gsm_single_shift_lookup(&t, buffer[i++]);
1330 c = gsm_locking_shift_lookup(&t, buffer[i++]);
1332 out += g_unichar_to_utf8(c, out);
/*
 * Encodes the UTF-8 string utf as a SIM string of at most max_length
 * bytes.  GSM default alphabet encoding is tried first; the UCS-2 path
 * below handles the remaining case.  The length of the result is stored
 * in *out_length.
 */
1340 unsigned char *utf8_to_sim_string(const char *utf, int max_length,
1343 unsigned char *result;
1344 unsigned char *ucs2;
1348 result = convert_utf8_to_gsm(utf, -1, NULL, &gsm_bytes, 0);
/*
 * Truncate to max_length, then drop trailing 0x1b bytes so that the
 * result does not end in a dangling escape.
 * NOTE(review): the UCS-2 path below treats max_length == -1 as "no
 * limit", but this comparison does not; with max_length == -1 gsm_bytes
 * would be clamped to -1.  Confirm that callers never pass -1 for text
 * that fits the GSM alphabet.
 */
1350 if (gsm_bytes > max_length) {
1351 gsm_bytes = max_length;
1352 while (gsm_bytes && result[gsm_bytes - 1] == 0x1b)
1356 *out_length = gsm_bytes;
1360 /* NOTE: UCS2 formats with an offset are never used */
1362 ucs2 = (guint8 *) g_convert(utf, -1, "UCS-2BE//TRANSLIT", "UTF-8",
1363 NULL, &converted, NULL);
/* One byte is needed in front of the UCS-2 data; keep the rest even */
1367 if (max_length != -1 && (int) converted + 1 > max_length)
1368 converted = (max_length - 1) & ~1;
1370 result = g_try_malloc(converted + 1);
1371 if (result == NULL) {
1376 *out_length = converted + 1;
/* The UCS-2 data starts at the second byte of the result */
1379 memcpy(&result[1], ucs2, converted);
1386 * Converts UCS2 encoded text to GSM alphabet. The result is unpacked, one
1387 * septet per byte with the top bit (bit 7) always 0. If terminator is not 0,
1388 * a terminator character is appended to the result.
1390 * Returns the encoded data or NULL if the data could not be encoded. The
1391 * data must be freed by the caller. If items_read is not NULL, it contains
1392 * the actual number of bytes read. If items_written is not NULL, contains
1393 * the number of bytes written.
1395 unsigned char *convert_ucs2_to_gsm_with_lang(const unsigned char *text,
1396 long len, long *items_read,
1397 long *items_written,
1398 unsigned char terminator,
1399 enum gsm_dialect locking_lang,
1400 enum gsm_dialect single_lang)
1402 struct conversion_table t;
1404 const unsigned char *in;
1406 unsigned char *res = NULL;
/* Select the lookup tables for the requested dialects */
1410 if (conversion_table_init(&t, locking_lang, single_lang) == FALSE)
/* UCS-2 input must be a non-empty, even number of bytes */
1413 if (len < 1 || len % 2)
/* First pass: check that every character can be encoded */
1419 for (i = 0; i < len; i += 2) {
/* Each character is two bytes, most significant byte first */
1420 gunichar c = (in[i] << 8) | in[i + 1];
1421 unsigned short converted = GUND;
/* Try the locking shift table first, then the single shift table */
1426 converted = unicode_locking_shift_lookup(&t, c);
1428 if (converted == GUND)
1429 converted = unicode_single_shift_lookup(&t, c);
1431 if (converted == GUND)
/*
 * NOTE(review): single shift results appear to carry the 0x1b escape in
 * their high byte -- confirm against the *_ext_unicode tables.
 */
1434 if (converted & 0x1b00)
/* Room for the encoded bytes plus the optional terminator */
1442 res = g_try_malloc(res_len + (terminator ? 1 : 0));
/* Second pass: repeat the lookups and write the output */
1449 for (i = 0; i < len; i += 2) {
1450 gunichar c = (in[i] << 8) | in[i + 1];
1451 unsigned short converted = GUND;
1453 converted = unicode_locking_shift_lookup(&t, c);
1455 if (converted == GUND)
1456 converted = unicode_single_shift_lookup(&t, c);
1458 if (converted & 0x1b00) {
1471 *items_written = out - res;
/*
 * Convenience wrapper around convert_ucs2_to_gsm_with_lang() that uses
 * the default dialect for both the locking and the single shift tables.
 */
1480 unsigned char *convert_ucs2_to_gsm(const unsigned char *text, long len,
1481 long *items_read, long *items_written,
1482 unsigned char terminator)
1484 return convert_ucs2_to_gsm_with_lang(text, len, items_read,
1487 GSM_DIALECT_DEFAULT,
1488 GSM_DIALECT_DEFAULT);