2 * Copyright © 2014 Canonical Limited
4 * SPDX-License-Identifier: LGPL-2.1-or-later
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 * Author: Ryan Lortie <desrt@desrt.ca>
24 #include "gstrfuncs.h"
49 #include "gtranslit-data.h"
/*
 * Helpers for decoding the encoded values stored in the mapping tables.
 *
 * When bit 0x8000 of an encoded value is clear, the value itself is the
 * (single) item.  When it is set, the low 12 bits (0x0fff) are an offset
 * into a side table and bits 0x7000 hold the number of items found there.
 *
 * Every macro parameter is parenthesised so that compound expressions
 * (for example `a | b`) can be passed safely.  `encoded` is still
 * evaluated more than once, so arguments must not have side effects.
 */

/* Item number `index` of the source sequence described by `encoded`. */
#define get_src_char(array, encoded, index) \
  (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))

/* Number of items described by `encoded`; always 1 for a direct value. */
#define get_length(encoded) \
  (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)
54 #if G_BYTE_ORDER == G_BIG_ENDIAN
55 #define get_ascii_item(array, encoded) ((encoded & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
57 #define get_ascii_item(array, encoded) ((encoded & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
60 static const gchar * lookup_in_item (guint item_id,
66 compare_mapping_entry (gconstpointer user_data,
69 const struct mapping_entry *entry = data;
70 const gunichar *key = user_data;
73 G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
75 src_0 = get_src_char (src_table, entry->src, 0);
79 else if (key[0] < src_0)
82 if (get_length (entry->src) > 1)
86 src_1 = get_src_char (src_table, entry->src, 1);
90 else if (key[1] < src_1)
100 lookup_in_mapping (const struct mapping_entry *mapping,
106 const struct mapping_entry *hit;
108 hit = bsearch (key, mapping, mapping_size, sizeof (struct mapping_entry), compare_mapping_entry);
113 *key_consumed = get_length (hit->src);
114 *result_len = get_length (hit->ascii);
116 return get_ascii_item(ascii_table, hit->ascii);
120 lookup_in_chain (const guint8 *chain,
127 while (*chain != 0xff)
129 result = lookup_in_item (*chain, key, result_len, key_consumed);
141 lookup_in_item (guint item_id,
148 const guint8 *chain = chains_table + chain_starts[item_id & 0x7f];
150 return lookup_in_chain (chain, key, result_len, key_consumed);
154 const struct mapping_range *range = &mapping_ranges[item_id];
156 return lookup_in_mapping (mappings_table + range->start, range->length, key, result_len, key_consumed);
161 compare_locale_entry (gconstpointer user_data,
164 const struct locale_entry *entry = data;
165 const gchar *key = user_data;
167 return strcmp (key, &locale_names[entry->name_offset]);
171 lookup_item_id_for_one_locale (const gchar *key,
174 const struct locale_entry *hit;
176 hit = bsearch (key, locale_index, G_N_ELEMENTS (locale_index), sizeof (struct locale_entry), compare_locale_entry);
181 *item_id = hit->item_id;
186 lookup_item_id_for_locale (const gchar *locale)
188 gchar key[MAX_LOCALE_NAME + 1];
189 const gchar *language;
191 const gchar *territory = NULL;
192 guint territory_len = 0;
193 const gchar *modifier = NULL;
194 guint modifier_len = 0;
195 const gchar *next_char;
198 /* As per POSIX, a valid locale looks like:
200 * language[_territory][.codeset][@modifier]
203 language_len = strcspn (language, "_.@");
204 next_char = language + language_len;
206 if (*next_char == '_')
208 territory = next_char;
209 territory_len = strcspn (territory + 1, "_.@") + 1;
210 next_char = territory + territory_len;
213 if (*next_char == '.')
215 const gchar *codeset;
219 codeset_len = strcspn (codeset + 1, "_.@") + 1;
220 next_char = codeset + codeset_len;
223 if (*next_char == '@')
225 modifier = next_char;
226 modifier_len = strcspn (modifier + 1, "_.@") + 1;
227 next_char = modifier + modifier_len;
230 /* What madness is this? */
231 if (language_len == 0 || *next_char)
232 return default_item_id;
234 /* We are not interested in codeset.
242 * Note: we have no locales of the form aa_BB@cc in the database.
250 if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME)
252 memcpy (key, language, language_len);
253 memcpy (key + language_len, modifier, modifier_len);
254 key[language_len + modifier_len] = '\0';
256 if (lookup_item_id_for_one_locale (key, &id))
261 if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME)
263 memcpy (key, language, language_len);
264 memcpy (key + language_len, territory, territory_len);
265 key[language_len + territory_len] = '\0';
267 if (lookup_item_id_for_one_locale (key, &id))
272 if (language_len <= MAX_LOCALE_NAME)
274 memcpy (key, language, language_len);
275 key[language_len] = '\0';
277 if (lookup_item_id_for_one_locale (key, &id))
281 return default_item_id;
285 get_default_item_id (void)
287 static guint item_id;
288 static gboolean done;
290 /* Doesn't need to be locked -- no harm in doing it twice. */
295 locale = setlocale (LC_CTYPE, NULL);
296 item_id = lookup_item_id_for_locale (locale);
305 * @str: a string, in UTF-8
306 * @from_locale: (nullable): the source locale, if known
308 * Transliterate @str to plain ASCII.
310 * For best results, @str should be in composed normalised form.
312 * This function performs a reasonably good set of character
313 * replacements. The particular set of replacements that is done may
314 * change by version or even by runtime environment.
316 * If the source language of @str is known, it can be used to improve the
317 * accuracy of the translation by passing it as @from_locale. It should
318 * be a valid POSIX locale string (of the form
319 * `language[_territory][.codeset][@modifier]`).
321 * If @from_locale is %NULL then the current locale is used.
323 * If you want to do translation for no specific locale, and you want it
324 * to be done independently of the current locale, specify `"C"` for
327 * Returns: a string in plain ASCII
332 g_str_to_ascii (const gchar *str,
333 const gchar *from_locale)
338 g_return_val_if_fail (str != NULL, NULL);
340 if (g_str_is_ascii (str))
341 return g_strdup (str);
344 item_id = lookup_item_id_for_locale (from_locale);
346 item_id = get_default_item_id ();
348 result = g_string_sized_new (strlen (str));
352 /* We only need to transliterate non-ASCII values... */
355 gunichar key[MAX_KEY_SIZE];
361 G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
363 c = g_utf8_get_char (str);
365 /* This is where it gets evil...
367 * We know that MAX_KEY_SIZE is 2. We also know that we
368 * only want to try another character if it's non-ascii.
370 str = g_utf8_next_char (str);
374 key[1] = g_utf8_get_char (str);
378 r = lookup_in_item (item_id, key, &r_len, &consumed);
380 /* If we failed to map two characters, try again with one.
382 * gconv behaviour is a bit weird here -- it seems to
383 * depend on the randomness of the binary search and the
384 * size of the input buffer as to what result we get here.
386 * Doing it this way is more work, but should be
389 if (r == NULL && key[1])
392 r = lookup_in_item (item_id, key, &r_len, &consumed);
397 g_string_append_len (result, r, r_len);
399 /* If it took both then skip again */
400 str = g_utf8_next_char (str);
402 else /* no match found */
403 g_string_append_c (result, '?');
405 else /* ASCII case */
406 g_string_append_c (result, *str++);
409 return g_string_free (result, FALSE);