5 #include "linguistics.h"
7 static const char *special[][3]={
8 /* Capital Diacritics */
16 /* ˝ Double Acute Accent */
32 /* ˛ Ogonek (nosinė) */
45 {"Đ","D","DJ"}, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
52 /* ˇ Caron (haček, paukščiukas) */
115 /* special letters */
116 {"Ð","D","DH"}, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
119 /* Small Diacritics */
127 /* ˝ Double Acute Accent */
143 /* ˛ Ogonek (nosinė) */
148 /* ˙ Dot (and dotless i) */
163 /* ˇ Caron (haček, paukščiukas) */
227 /* special letters */
234 * @brief Replace special characters in string (e.g. umlauts) with plain letters.
235 * This is useful e.g. to canonicalize a string for comparison.
237 * @param str string to process
238 * @param mode Replacement mode. 0=do nothing, 1=replace with single
239 * ASCII letter, 2=replace with multiple letters if the commonly used
240 * ASCII replacement has multiple letters (e.g. a-umlaut -> ae)
241 * @returns copy of string, with characters replaced
244 linguistics_expand_special(char *str, int mode)
249 out=ret=g_strdup(str);
253 char *next=g_utf8_find_next_char(in, NULL);
257 for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++) {
258 const char *search=special[i][0];
259 if (!strncmp(in,search,len)) {
260 const char *replace=special[i][mode];
262 int replace_len=strlen(replace);
263 dbg_assert(replace_len <= len);
264 dbg(1,"found %s %s %d %s %d\n",in,search,len,replace,replace_len);
265 strcpy(out, replace);
290 linguistics_next_word(char *str)
292 int len=strcspn(str, " -/()");
293 if (!str[len] || !str[len+1])
299 linguistics_search(char *str)
301 if (!g_strcasecmp(str,"str"))
303 if (!g_strcasecmp(str,"str."))
305 if (!g_strcasecmp(str,"strasse"))
307 if (!g_strcasecmp(str,"weg"))
313 linguistics_init(void)