gettext-tools/src/filter-sr-latin.c

   1 /* Recode Serbian text from Cyrillic to Latin script.
   2    Copyright (C) 2006-2007, 2009 Free Software Foundation, Inc.
   3    Written by Danilo Šegan <danilo@gnome.org>, 2006,
   4    and Bruno Haible <bruno@clisp.org>, 2006.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include <config.h>
  21 #endif
  22
  23 /* Specification.  */
  24 #include "filters.h"
  25
  26 #include <stdlib.h>
  27
  28 #include "xalloc.h"
  29
  30
  31 /* Table for Serbian Cyrillic to Latin transcription.
  32    The table is indexed by the Unicode code point, in the range 0x0400..0x04ef.
  33    The longest table entry is three bytes long.  */
  34 static const char table[240][3 + 1] =
  35 {
  36   /* U+0400 */ "\xC3\x88", /* "È" */
  37   /* U+0401 */ "",
  38   /* U+0402 */ "\xC4\x90", /* "Đ" */
  39   /* U+0403 */ "",
  40   /* U+0404 */ "",
  41   /* U+0405 */ "",
  42   /* U+0406 */ "",
  43   /* U+0407 */ "",
  44   /* U+0408 */ "J",
  45   /* U+0409 */ "Lj",
  46   /* U+040A */ "Nj",
  47   /* U+040B */ "\xC4\x86", /* "Ć" */
  48   /* U+040C */ "",
  49   /* U+040D */ "\xC3\x8C", /* "Ì" */
  50   /* U+040E */ "",
  51   /* U+040F */ "D\xC5\xBE", /* "Dž" */
  52   /* U+0410 */ "A",
  53   /* U+0411 */ "B",
  54   /* U+0412 */ "V",
  55   /* U+0413 */ "G",
  56   /* U+0414 */ "D",
  57   /* U+0415 */ "E",
  58   /* U+0416 */ "\xC5\xBD", /* "Ž" */
  59   /* U+0417 */ "Z",
  60   /* U+0418 */ "I",
  61   /* U+0419 */ "",
  62   /* U+041A */ "K",
  63   /* U+041B */ "L",
  64   /* U+041C */ "M",
  65   /* U+041D */ "N",
  66   /* U+041E */ "O",
  67   /* U+041F */ "P",
  68   /* U+0420 */ "R",
  69   /* U+0421 */ "S",
  70   /* U+0422 */ "T",
  71   /* U+0423 */ "U",
  72   /* U+0424 */ "F",
  73   /* U+0425 */ "H",
  74   /* U+0426 */ "C",
  75   /* U+0427 */ "\xC4\x8C", /* "Č" */
  76   /* U+0428 */ "\xC5\xA0", /* "Š" */
  77   /* U+0429 */ "",
  78   /* U+042A */ "",
  79   /* U+042B */ "",
  80   /* U+042C */ "",
  81   /* U+042D */ "",
  82   /* U+042E */ "",
  83   /* U+042F */ "",
  84   /* U+0430 */ "a",
  85   /* U+0431 */ "b",
  86   /* U+0432 */ "v",
  87   /* U+0433 */ "g",
  88   /* U+0434 */ "d",
  89   /* U+0435 */ "e",
  90   /* U+0436 */ "\xC5\xBE", /* "ž" */
  91   /* U+0437 */ "z",
  92   /* U+0438 */ "i",
  93   /* U+0439 */ "",
  94   /* U+043A */ "k",
  95   /* U+043B */ "l",
  96   /* U+043C */ "m",
  97   /* U+043D */ "n",
  98   /* U+043E */ "o",
  99   /* U+043F */ "p",
 100   /* U+0440 */ "r",
 101   /* U+0441 */ "s",
 102   /* U+0442 */ "t",
 103   /* U+0443 */ "u",
 104   /* U+0444 */ "f",
 105   /* U+0445 */ "h",
 106   /* U+0446 */ "c",
 107   /* U+0447 */ "\xC4\x8D", /* "č" */
 108   /* U+0448 */ "\xC5\xA1", /* "š" */
 109   /* U+0449 */ "",
 110   /* U+044A */ "",
 111   /* U+044B */ "",
 112   /* U+044C */ "",
 113   /* U+044D */ "",
 114   /* U+044E */ "",
 115   /* U+044F */ "",
 116   /* U+0450 */ "\xC3\xA8", /* "è" */
 117   /* U+0451 */ "",
 118   /* U+0452 */ "\xC4\x91", /* "đ" */
 119   /* U+0453 */ "",
 120   /* U+0454 */ "",
 121   /* U+0455 */ "",
 122   /* U+0456 */ "",
 123   /* U+0457 */ "",
 124   /* U+0458 */ "j",
 125   /* U+0459 */ "lj",
 126   /* U+045A */ "nj",
 127   /* U+045B */ "\xC4\x87", /* "ć" */
 128   /* U+045C */ "",
 129   /* U+045D */ "\xC3\xAC", /* "ì" */
 130   /* U+045E */ "",
 131   /* U+045F */ "d\xC5\xBE", /* "dž" */
 132   /* U+0460 */ "",
 133   /* U+0461 */ "",
 134   /* U+0462 */ "",
 135   /* U+0463 */ "",
 136   /* U+0464 */ "",
 137   /* U+0465 */ "",
 138   /* U+0466 */ "",
 139   /* U+0467 */ "",
 140   /* U+0468 */ "",
 141   /* U+0469 */ "",
 142   /* U+046A */ "",
 143   /* U+046B */ "",
 144   /* U+046C */ "",
 145   /* U+046D */ "",
 146   /* U+046E */ "",
 147   /* U+046F */ "",
 148   /* U+0470 */ "",
 149   /* U+0471 */ "",
 150   /* U+0472 */ "",
 151   /* U+0473 */ "",
 152   /* U+0474 */ "",
 153   /* U+0475 */ "",
 154   /* U+0476 */ "",
 155   /* U+0477 */ "",
 156   /* U+0478 */ "",
 157   /* U+0479 */ "",
 158   /* U+047A */ "",
 159   /* U+047B */ "",
 160   /* U+047C */ "",
 161   /* U+047D */ "",
 162   /* U+047E */ "",
 163   /* U+047F */ "",
 164   /* U+0480 */ "",
 165   /* U+0481 */ "",
 166   /* U+0482 */ "",
 167   /* U+0483 */ "",
 168   /* U+0484 */ "",
 169   /* U+0485 */ "",
 170   /* U+0486 */ "",
 171   /* U+0487 */ "",
 172   /* U+0488 */ "",
 173   /* U+0489 */ "",
 174   /* U+048A */ "",
 175   /* U+048B */ "",
 176   /* U+048C */ "",
 177   /* U+048D */ "",
 178   /* U+048E */ "",
 179   /* U+048F */ "",
 180   /* U+0490 */ "",
 181   /* U+0491 */ "",
 182   /* U+0492 */ "",
 183   /* U+0493 */ "",
 184   /* U+0494 */ "",
 185   /* U+0495 */ "",
 186   /* U+0496 */ "",
 187   /* U+0497 */ "",
 188   /* U+0498 */ "",
 189   /* U+0499 */ "",
 190   /* U+049A */ "",
 191   /* U+049B */ "",
 192   /* U+049C */ "",
 193   /* U+049D */ "",
 194   /* U+049E */ "",
 195   /* U+049F */ "",
 196   /* U+04A0 */ "",
 197   /* U+04A1 */ "",
 198   /* U+04A2 */ "",
 199   /* U+04A3 */ "",
 200   /* U+04A4 */ "",
 201   /* U+04A5 */ "",
 202   /* U+04A6 */ "",
 203   /* U+04A7 */ "",
 204   /* U+04A8 */ "",
 205   /* U+04A9 */ "",
 206   /* U+04AA */ "",
 207   /* U+04AB */ "",
 208   /* U+04AC */ "",
 209   /* U+04AD */ "",
 210   /* U+04AE */ "",
 211   /* U+04AF */ "",
 212   /* U+04B0 */ "",
 213   /* U+04B1 */ "",
 214   /* U+04B2 */ "",
 215   /* U+04B3 */ "",
 216   /* U+04B4 */ "",
 217   /* U+04B5 */ "",
 218   /* U+04B6 */ "",
 219   /* U+04B7 */ "",
 220   /* U+04B8 */ "",
 221   /* U+04B9 */ "",
 222   /* U+04BA */ "",
 223   /* U+04BB */ "",
 224   /* U+04BC */ "",
 225   /* U+04BD */ "",
 226   /* U+04BE */ "",
 227   /* U+04BF */ "",
 228   /* U+04C0 */ "",
 229   /* U+04C1 */ "",
 230   /* U+04C2 */ "",
 231   /* U+04C3 */ "",
 232   /* U+04C4 */ "",
 233   /* U+04C5 */ "",
 234   /* U+04C6 */ "",
 235   /* U+04C7 */ "",
 236   /* U+04C8 */ "",
 237   /* U+04C9 */ "",
 238   /* U+04CA */ "",
 239   /* U+04CB */ "",
 240   /* U+04CC */ "",
 241   /* U+04CD */ "",
 242   /* U+04CE */ "",
 243   /* U+04CF */ "",
 244   /* U+04D0 */ "",
 245   /* U+04D1 */ "",
 246   /* U+04D2 */ "",
 247   /* U+04D3 */ "",
 248   /* U+04D4 */ "",
 249   /* U+04D5 */ "",
 250   /* U+04D6 */ "",
 251   /* U+04D7 */ "",
 252   /* U+04D8 */ "",
 253   /* U+04D9 */ "",
 254   /* U+04DA */ "",
 255   /* U+04DB */ "",
 256   /* U+04DC */ "",
 257   /* U+04DD */ "",
 258   /* U+04DE */ "",
 259   /* U+04DF */ "",
 260   /* U+04E0 */ "",
 261   /* U+04E1 */ "",
 262   /* U+04E2 */ "\xC4\xAA", /* "Ī" */
 263   /* U+04E3 */ "\xC4\xAB", /* "ī" */
 264   /* U+04E4 */ "",
 265   /* U+04E5 */ "",
 266   /* U+04E6 */ "",
 267   /* U+04E7 */ "",
 268   /* U+04E8 */ "",
 269   /* U+04E9 */ "",
 270   /* U+04EA */ "",
 271   /* U+04EB */ "",
 272   /* U+04EC */ "",
 273   /* U+04ED */ "",
 274   /* U+04EE */ "\xC5\xAA", /* "Ū" */
 275   /* U+04EF */ "\xC5\xAB" /* "ū" */
 276 };
 277
 278 /* Quick test for an uppercase character in the range U+0041..U+005A.
 279    The argument must be a byte in the range 0..UCHAR_MAX.  */
 280 #define IS_UPPERCASE_LATIN(byte) \
 281   ((unsigned char) ((byte) - 'A') <= 'Z' - 'A')
 282
 283 /* Quick test for an uppercase character in the range U+0400..U+042F,
 284    or exactly U+04E2 or U+04EE.
 285    The arguments must be bytes in the range 0..UCHAR_MAX.  */
 286 #define IS_UPPERCASE_CYRILLIC(byte1,byte2) \
 287   (((byte1) == 0xd0 && (unsigned char) ((byte2) - 0x80) < 0x30) \
 288    || ((byte1) == 0xd3 && ((byte2) == 0xa2 || (byte2) == 0xae)))
 289
 290 void
 291 serbian_to_latin (const char *input, size_t input_len,
 292                   char **output_p, size_t *output_len_p)
 293 {
 294   /* Loop through the input string, producing a replacement for each character.
 295      Only characters in the range U+0400..U+04EF (\xD0\x80..\xD3\xAF) need to
 296      be handled, and more precisely only those for which a replacement exists
 297      in the table.  Other characters are copied without modification.
 298      The characters U+0409, U+040A, U+040F are transliterated to uppercase or
 299      mixed-case replacements ("LJ" / "Lj", "NJ" / "Nj", "DŽ" / "Dž"), depending
 300      on the case of the surrounding characters.
 301      Since we assume UTF-8 encoding, the bytes \xD0..\xD3 can only occur at the
 302      beginning of a character; the second and further bytes of a character are
 303      all in the range \x80..\xBF.  */
 304
 305   /* Since sequences of 2 bytes are mapped to sequences of at most 3 bytes,
 306      the size of the output will be at most 1.5 * input_len.  */
 307   size_t allocated = input_len + (input_len >> 1);
 308   char *output = XNMALLOC (allocated, char);
 309
 310   const char *input_end = input + input_len;
 311   const char *ip;
 312   char *op;
 313
 314   for (ip = input, op = output; ip < input_end; )
 315     {
 316       unsigned char byte = (unsigned char) *ip;
 317
 318       /* Test for the first byte of a Cyrillic character.  */
 319       if ((byte >= 0xd0 && byte <= 0xd3) && (ip + 1 < input_end))
 320         {
 321           unsigned char second_byte = (unsigned char) ip[1];
 322
 323           /* Verify the second byte is valid.  */
 324           if (second_byte >= 0x80 && second_byte < 0xc0)
 325             {
 326               unsigned int uc = ((byte & 0x1f) << 6) | (second_byte & 0x3f);
 327
 328               if (uc >= 0x0400 && uc <= 0x04ef)
 329                 {
 330                   /* Look up replacement from the table.  */
 331                   const char *repl = table[uc - 0x0400];
 332
 333                   if (repl[0] != '\0')
 334                     {
 335                       /* Found a replacement.
 336                          Now handle the special cases.  */
 337                       if (uc == 0x0409 || uc == 0x040a || uc == 0x040f)
 338                         if ((ip + 2 < input_end
 339                              && IS_UPPERCASE_LATIN ((unsigned char) ip[2]))
 340                             || (ip + 3 < input_end
 341                                 && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[2],
 342                                                           (unsigned char) ip[3]))
 343                             || (ip >= input + 1
 344                                 && IS_UPPERCASE_LATIN ((unsigned char) ip[-1]))
 345                             || (ip >= input + 2
 346                                 && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[-2],
 347                                                           (unsigned char) ip[-1])))
 348                           {
 349                             /* Use the upper-case replacement instead of
 350                                the mixed-case replacement.  */
 351                             switch (uc)
 352                               {
 353                               case 0x0409:
 354                                 repl = "LJ"; break;
 355                               case 0x040a:
 356                                 repl = "NJ"; break;
 357                               case 0x040f:
 358                                 repl = "D\xC5\xBD"/* "DŽ" */; break;
 359                               default:
 360                                 abort ();
 361                               }
 362                           }
 363
 364                       /* Use the replacement.  */
 365                       *op++ = *repl++;
 366                       if (*repl != '\0')
 367                         {
 368                           *op++ = *repl++;
 369                           if (*repl != '\0')
 370                             {
 371                               *op++ = *repl++;
 372                               /* All replacements have at most 3 bytes.  */
 373                               if (*repl != '\0')
 374                                 abort ();
 375                             }
 376                         }
 377                       ip += 2;
 378                       continue;
 379                     }
 380                 }
 381             }
 382         }
 383       *op++ = *ip++;
 384     }
 385
 386   {
 387     size_t output_len = op - output;
 388
 389     /* Verify that the allocated size was not exceeded.  */
 390     if (output_len > allocated)
 391       abort ();
 392     /* Shrink the result.  */
 393     if (output_len < allocated)
 394       output = (char *) xrealloc (output, output_len);
 395
 396     /* Done.  */
 397     *output_p = output;
 398     *output_len_p = output_len;
 399   }
 400 }