1 /* Recode Serbian text from Cyrillic to Latin script.
2 Copyright (C) 2006-2007, 2009 Free Software Foundation, Inc.
3 Written by Danilo Šegan <danilo@gnome.org>, 2006,
4 and Bruno Haible <bruno@clisp.org>, 2006.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
31 /* Table for Serbian Cyrillic to Latin transcription.
32 The table is indexed by the Unicode code point, in the range 0x0400..0x04ef.
33 The longest table entry is three bytes long. */
34 static const char table[240][3 + 1] =
36 /* U+0400 */ "\xC3\x88", /* "È" */
38 /* U+0402 */ "\xC4\x90", /* "Đ" */
47 /* U+040B */ "\xC4\x86", /* "Ć" */
49 /* U+040D */ "\xC3\x8C", /* "Ì" */
51 /* U+040F */ "D\xC5\xBE", /* "Dž" */
58 /* U+0416 */ "\xC5\xBD", /* "Ž" */
75 /* U+0427 */ "\xC4\x8C", /* "Č" */
76 /* U+0428 */ "\xC5\xA0", /* "Š" */
90 /* U+0436 */ "\xC5\xBE", /* "ž" */
107 /* U+0447 */ "\xC4\x8D", /* "č" */
108 /* U+0448 */ "\xC5\xA1", /* "š" */
116 /* U+0450 */ "\xC3\xA8", /* "è" */
118 /* U+0452 */ "\xC4\x91", /* "đ" */
127 /* U+045B */ "\xC4\x87", /* "ć" */
129 /* U+045D */ "\xC3\xAC", /* "ì" */
131 /* U+045F */ "d\xC5\xBE", /* "dž" */
262 /* U+04E2 */ "\xC4\xAA", /* "Ī" */
263 /* U+04E3 */ "\xC4\xAB", /* "ī" */
274 /* U+04EE */ "\xC5\xAA", /* "Ū" */
275 /* U+04EF */ "\xC5\xAB" /* "ū" */
/* Nonzero iff BYTE is one of the ASCII capital letters 'A'..'Z'
   (U+0041..U+005A).
   BYTE must be a value in the range 0..UCHAR_MAX.  The offset from 'A' is
   reduced to unsigned char, so a value below 'A' wraps around to a large
   number and a single comparison covers both ends of the range.  */
#define IS_UPPERCASE_LATIN(byte) \
  ((unsigned char) ((byte) - 'A') < ('Z' - 'A' + 1))
/* Nonzero iff the byte pair (BYTE1, BYTE2) is the UTF-8 encoding of an
   uppercase Cyrillic character: either one in the range U+0400..U+042F
   (lead byte 0xD0, trail byte 0x80..0xAF), or exactly U+04E2 or U+04EE
   (lead byte 0xD3, trail byte 0xA2 or 0xAE).
   Both arguments must be values in the range 0..UCHAR_MAX.  */
#define IS_UPPERCASE_CYRILLIC(byte1,byte2) \
  ((byte1) == 0xd0                                       \
   ? (unsigned char) ((byte2) - 0x80) < 0x30             \
   : ((byte1) == 0xd3                                    \
      && ((byte2) == 0xa2 || (byte2) == 0xae)))
291 serbian_to_latin (const char *input, size_t input_len,
292 char **output_p, size_t *output_len_p)
294 /* Loop through the input string, producing a replacement for each character.
295 Only characters in the range U+0400..U+04EF (\xD0\x80..\xD3\xAF) need to
296 be handled, and more precisely only those for which a replacement exists
297 in the table. Other characters are copied without modification.
298 The characters U+0409, U+040A, U+040F are transliterated to uppercase or
299 mixed-case replacements ("LJ" / "Lj", "NJ" / "Nj", "DŽ" / "Dž"), depending
300 on the case of the surrounding characters.
301 Since we assume UTF-8 encoding, the bytes \xD0..\xD3 can only occur at the
302 beginning of a character; the second and further bytes of a character are
303 all in the range \x80..\xBF. */
305 /* Since sequences of 2 bytes are mapped to sequences of at most 3 bytes,
306 the size of the output will be at most 1.5 * input_len. */
307 size_t allocated = input_len + (input_len >> 1);
308 char *output = XNMALLOC (allocated, char);
310 const char *input_end = input + input_len;
314 for (ip = input, op = output; ip < input_end; )
316 unsigned char byte = (unsigned char) *ip;
318 /* Test for the first byte of a Cyrillic character. */
319 if ((byte >= 0xd0 && byte <= 0xd3) && (ip + 1 < input_end))
321 unsigned char second_byte = (unsigned char) ip[1];
323 /* Verify the second byte is valid. */
324 if (second_byte >= 0x80 && second_byte < 0xc0)
326 unsigned int uc = ((byte & 0x1f) << 6) | (second_byte & 0x3f);
328 if (uc >= 0x0400 && uc <= 0x04ef)
330 /* Look up replacement from the table. */
331 const char *repl = table[uc - 0x0400];
335 /* Found a replacement.
336 Now handle the special cases. */
337 if (uc == 0x0409 || uc == 0x040a || uc == 0x040f)
338 if ((ip + 2 < input_end
339 && IS_UPPERCASE_LATIN ((unsigned char) ip[2]))
340 || (ip + 3 < input_end
341 && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[2],
342 (unsigned char) ip[3]))
344 && IS_UPPERCASE_LATIN ((unsigned char) ip[-1]))
346 && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[-2],
347 (unsigned char) ip[-1])))
349 /* Use the upper-case replacement instead of
350 the mixed-case replacement. */
358 repl = "D\xC5\xBD"/* "DŽ" */; break;
364 /* Use the replacement. */
372 /* All replacements have at most 3 bytes. */
387 size_t output_len = op - output;
389 /* Verify that the allocated size was not exceeded. */
390 if (output_len > allocated)
392 /* Shrink the result. */
393 if (output_len < allocated)
394 output = (char *) xrealloc (output, output_len);
398 *output_len_p = output_len;