navit/navit/linguistics.c

   1 #include <string.h>
   2 #include <stdio.h>
   3 #include <glib.h>
   4 #include "debug.h"
   5 #include "linguistics.h"
   6
   7 static const char *special[][3]={
     /*
      * Replacement table consulted by linguistics_expand_special().
      *   column 0: the special character (UTF-8 encoded) to look for
      *   column 1: replacement used when mode is 1
      *   column 2: replacement used when mode is 2; rows that list only two
      *             strings leave this slot NULL, and the lookup skips a row
      *             whose slot for the requested mode is NULL
      * A replacement must never be longer (in bytes) than its column-0
      * character, because linguistics_expand_special() asserts this before
      * writing the replacement into the output buffer.
      */
   8 /* Capital Diacritics */
   9 /* ¨ Diaeresis */
  10 {"Ä","A","AE"},
  11 {"Ë","E"},
  12 {"Ï","I"},
  13 {"Ö","O","OE"},
  14 {"Ü","U","UE"},
  15 {"Ÿ","Y"},
  16 /* ˝ Double Acute Accent (the mode-2 replacement is itself a non-ASCII letter) */
  17 {"Ő","O","Ö"},
  18 {"Ű","U","Ü"},
  19 /* ´ Acute Accent */
  20 {"Á","A"},
  21 {"Ć","C"},
  22 {"É","E"},
  23 {"Í","I"},
  24 {"Ĺ","L"},
  25 {"Ń","N"},
  26 {"Ó","O"},
  27 {"Ŕ","R"},
  28 {"Ś","S"},
  29 {"Ú","U"},
  30 {"Ý","Y"},
  31 {"Ź","Z"},
  32 /* ˛ Ogonek (nosinė) */
  33 {"Ą","A"},
  34 {"Ę","E"},
  35 {"Į","I"},
  36 {"Ų","U"},
  37 /* ˙ Dot */
  38 {"Ċ","C"},
  39 {"Ė","E"},
  40 {"Ġ","G"},
  41 {"İ","I"},
  42 {"Ŀ","L"},
  43 {"Ż","Z"},
  44 /* – Stroke */
  45 {"Đ","D","DJ"}, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
  46 {"Ħ","H"},
  47 {"Ł","L"},
  48 {"Ŧ","T"},
  49 /* ˚ Ring */
  50 {"Å","A","AA"},
  51 {"Ů","U"},
  52 /* ˇ Caron (haček, paukščiukas) */
  53 {"Č","C"},
  54 {"Ď","D"},
  55 {"Ě","E"},
  56 {"Ľ","L"},
  57 {"Ň","N"},
  58 {"Ř","R"},
  59 {"Š","S"},
  60 {"Ť","T"},
  61 {"Ž","Z"},
  62 /* / Slash */
  63 {"Ø","O","OE"},
  64 /* ¯ Macron */
  65 {"Ā","A","AA"},
  66 {"Ē","E","EE"},
  67 {"Ī","I","II"},
  68 {"Ō","O","OO"},
  69 {"Ū","U","UU"},
  70 /* ˘ Breve */
  71 {"Ă","A"},
  72 {"Ĕ","E"},
  73 {"Ğ","G"},
  74 {"Ĭ","I"},
  75 {"Ŏ","O"},
  76 {"Ŭ","U"},
  77 /* ^ Circumflex */
  78 {"Â","A"},
  79 {"Ĉ","C"},
  80 {"Ê","E"},
  81 {"Ĝ","G"},
  82 {"Ĥ","H"},
  83 {"Î","I"},
  84 {"Ĵ","J"},
  85 {"Ô","O"},
  86 {"Ŝ","S"},
  87 {"Û","U"},
  88 {"Ŵ","W"},
  89 {"Ŷ","Y"},
  90 /* ¸ Cedilla */
  91 {"Ç","C"},
  92 {"Ģ","G","GJ"},
  93 {"Ķ","K","KJ"},
  94 {"Ļ","L","LJ"},
  95 {"Ņ","N","NJ"},
  96 {"Ŗ","R"},
  97 {"Ş","S"},
  98 {"Ţ","T"},
  99 /* ~ Tilde */
 100 {"Ã","A"},
 101 {"Ĩ","I"},
 102 {"Ñ","N"},
 103 {"Õ","O"},
 104 {"Ũ","U"},
 105 /* ` Grave */
 106 {"À","A"},
 107 {"È","E"},
 108 {"Ì","I"},
 109 {"Ò","O"},
 110 {"Ù","U"},
 111 /* ligatures (the IJ ligature's mode-1 replacement is two letters; it has no separate mode-2 entry) */
 112 {"Æ","A","AE"},
 113 {"Ĳ","IJ"},
 114 {"Œ","O","OE"},
 115 /* special letters */
 116 {"Ð","D","DH"}, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
 117 {"Ŋ","N","NG"},
 118 {"Þ","T","TH"},
 119 /* Small Diacritics */
 120 /* ¨ Diaeresis */
 121 {"ä","a","ae"},
 122 {"ë","e"},
 123 {"ï","i"},
 124 {"ö","o","oe"},
 125 {"ü","u","ue"},
 126 {"ÿ","y"},
 127 /* ˝ Double Acute Accent (the mode-2 replacement is itself a non-ASCII letter) */
 128 {"ő","o","ö"},
 129 {"ű","u","ü"},
 130 /* ´ Acute Accent */
 131 {"á","a"},
 132 {"ć","c"},
 133 {"é","e"},
 134 {"í","i"},
 135 {"ĺ","l"},
 136 {"ń","n"},
 137 {"ó","o"},
 138 {"ŕ","r"},
 139 {"ś","s"},
 140 {"ú","u"},
 141 {"ý","y"},
 142 {"ź","z"},
 143 /* ˛ Ogonek (nosinė) */
 144 {"ą","a"},
 145 {"ę","e"},
 146 {"į","i"},
 147 {"ų","u"},
 148 /* ˙ Dot (and dotless i) */
 149 {"ċ","c"},
 150 {"ė","e"},
 151 {"ġ","g"},
 152 {"ı","i"},
 153 {"ŀ","l"},
 154 {"ż","z"},
 155 /* – Stroke */
 156 {"đ","d","dj"},
 157 {"ħ","h"},
 158 {"ł","l"},
 159 {"ŧ","t"},
 160 /* ˚ Ring */
 161 {"å","a", "aa"},
 162 {"ů","u"},
 163 /* ˇ Caron (haček, paukščiukas) */
 164 {"č","c"},
 165 {"ď","d"},
 166 {"ě","e"},
 167 {"ľ","l"},
 168 {"ň","n"},
 169 {"ř","r"},
 170 {"š","s"},
 171 {"ť","t"},
 172 {"ž","z"},
 173 /* / Slash */
 174 {"ø","o", "oe"},
 175 /* ¯ Macron */
 176 {"ā","a","aa"},
 177 {"ē","e","ee"},
 178 {"ī","i","ii"},
 179 {"ō","o","oo"},
 180 {"ū","u","uu"},
 181 /* ˘ Breve */
 182 {"ă","a"},
 183 {"ĕ","e"},
 184 {"ğ","g"},
 185 {"ĭ","i"},
 186 {"ŏ","o"},
 187 {"ŭ","u"},
 188 /* ^ Circumflex */
 189 {"â","a"},
 190 {"ĉ","c"},
 191 {"ê","e"},
 192 {"ĝ","g"},
 193 {"ĥ","h"},
 194 {"î","i"},
 195 {"ĵ","j"},
 196 {"ô","o"},
 197 {"ŝ","s"},
 198 {"û","u"},
 199 {"ŵ","w"},
 200 {"ŷ","y"},
 201 /* ¸ Cedilla */
 202 {"ç","c"},
 203 {"ģ","g","gj"},
 204 {"ķ","k","kj"},
 205 {"ļ","l","lj"},
 206 {"ņ","n","nj"},
 207 {"ŗ","r"},
 208 {"ş","s"},
 209 {"ţ","t"},
 210 /* ~ Tilde */
 211 {"ã","a"},
 212 {"ĩ","i"},
 213 {"õ","o"},
 214 {"ñ","n"},
 215 {"ũ","u"},
 216 /* ` Grave */
 217 {"à","a"},
 218 {"è","e"},
 219 {"ì","i"},
 220 {"ò","o"},
 221 {"ù","u"},
 222 /* ligatures */
 223 {"æ","a","ae"},
 224 {"ĳ","ij"},
 225 {"œ","o","oe"},
 226 {"ß","s","ss"},
 227 /* special letters */
 228 {"ð","d","dh"},
 229 {"ŋ","n","ng"},
 230 {"þ","t","th"},
 231 };
 232
 233 /**
 234  * @brief Replace special characters in string (e.g. umlauts) with plain letters.
 235  * This is useful e.g. to canonicalize a string for comparison.
 236  *
 237  * @param str string to process
 238  * @param mode Replacement mode. 0=do nothing, 1=replace with single
 239  * ASCII letter, 2=replace with multiple letters if the commonly used
 240  * ASCII replacement has multitple letter (e.g. a-umlaut -> ae)
 241  * @returns copy of string, with characters replaced
 242  */
 243 char *
 244 linguistics_expand_special(char *str, int mode)
 245 {
 246         char *in=str;
 247         char *out,*ret;
 248         int found=0;
 249         out=ret=g_strdup(str);
 250         if (!mode)
 251                 return ret;
 252         while (*in) {
 253                 char *next=g_utf8_find_next_char(in, NULL);
 254                 int i,len=next-in;
 255                 int match=0;
 256                 if (len > 1) {
 257                         for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++) {
 258                                 const char *search=special[i][0];
 259                                 if (!strncmp(in,search,len)) {
 260                                         const char *replace=special[i][mode];
 261                                         if (replace) {
 262                                                 int replace_len=strlen(replace);
 263                                                 dbg_assert(replace_len <= len);
 264                                                 dbg(1,"found %s %s %d %s %d\n",in,search,len,replace,replace_len);
 265                                                 strcpy(out, replace);
 266                                                 out+=replace_len;
 267                                                 match=1;
 268                                                 break;
 269                                         }
 270                                 }
 271                         }
 272                 }
 273                 if (match) {
 274                         found=1;
 275                         in=next;
 276                 } else {
 277                         while (len-- > 0)
 278                                 *out++=*in++;
 279                 }
 280         }
 281         *out++='\0';
 282         if (!found) {
 283                 g_free(ret);
 284                 ret=NULL;
 285         }
 286         return ret;
 287 }
 288
 /**
  * @brief Find the text that follows the first word separator in a string.
  *
  * The separators are space, '-', '/', '(' and ')'. Only a single separator
  * is skipped, so if several separators follow each other the returned
  * pointer addresses the second one.
  *
  * @param str NUL-terminated string to scan; may be NULL
  * @returns pointer into str just past the first separator, or NULL if str
  * is NULL, contains no separator, or ends right after the first separator
  */
 char *
 linguistics_next_word(char *str)
 {
         size_t len;

         if (!str)
                 return NULL;
         /* size_t avoids truncating the span length that strcspn() reports. */
         len=strcspn(str, " -/()");
         if (str[len] == '\0' || str[len+1] == '\0')
                 return NULL;
         return str+len+1;
 }
 297
 /**
  * @brief Check a word against a short list of generic street-name words.
  *
  * The comparison ignores ASCII case and is independent of the current
  * locale. The listed words are "str", "str.", "strasse" and "weg".
  * NOTE(review): presumably callers skip words for which this returns 0
  * when building search terms -- confirm against the callers.
  *
  * @param str word to check
  * @returns 0 if str equals one of the listed words, 1 otherwise
  */
 int
 linguistics_search(char *str)
 {
         static const char *const generic_words[]={
                 "str",
                 "str.",
                 "strasse",
                 "weg",
         };
         size_t i;

         for (i = 0 ; i < sizeof(generic_words)/sizeof(generic_words[0]); i++) {
                 /* g_ascii_strcasecmp() replaces the deprecated, locale-dependent g_strcasecmp(). */
                 if (!g_ascii_strcasecmp(str,generic_words[i]))
                         return 0;
         }
         return 1;
 }
 311
 /**
  * @brief Initialisation entry point of the linguistics module.
  *
  * The body is empty: calling it has no effect.
  */
 void linguistics_init(void)
 {
 }