1 /* GStreamer language codes and names utility functions
2 * Copyright (C) 2009 Tim-Philipp Müller <tim centricular net>
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
21 * SECTION:gsttaglanguagecodes
22 * @short_description: mappings for ISO-639 language codes and names
23 * @see_also: #GstTagList
27 * Provides helper functions to convert between the various ISO-639 language
28 * codes, and to map language codes to language names.
33 /* FIXME 0.11: maybe switch to ISO-639-2 everywhere incl. GST_TAG_LANGUAGE? */
/* Text domain used by this file; it is bound to ISO_CODES_LOCALEDIR in
 * gst_tag_load_iso_639_xml(). Any earlier definition is dropped first. */
39 #undef GETTEXT_PACKAGE
40 #define GETTEXT_PACKAGE "iso_639"
/* Path of the iso-codes XML file read by gst_tag_load_iso_639_xml(), and the
 * locale directory the text domain above is bound to. Both are relative to
 * ISO_CODES_PREFIX, which is not defined in this file. */
42 #define ISO_639_XML_PATH ISO_CODES_PREFIX "/share/xml/iso-codes/iso_639.xml"
43 #define ISO_CODES_LOCALEDIR ISO_CODES_PREFIX "/share/locale"
45 #include <gst/gst-i18n-plugin.h>
50 #include "lang-tables.dat"
52 /* FIXME: remove once we depend on GLib >= 2.22 */
53 #if !GLIB_CHECK_VERSION (2, 22, 0)
54 #define g_mapped_file_unref g_mapped_file_free
57 #ifndef GST_DISABLE_GST_DEBUG
59 #define GST_CAT_DEFAULT ensure_debug_category()
61 static GstDebugCategory *
62 ensure_debug_category (void)
64 static gsize cat_gonce = 0;
66 if (g_once_init_enter (&cat_gonce)) {
69 cat_done = (gsize) _gst_debug_category_new ("tag-langcodes", 0,
70 "GstTag language codes and names");
72 g_once_init_leave (&cat_gonce, cat_done);
75 return (GstDebugCategory *) cat_gonce;
80 #define ensure_debug_category() /* NOOP */
82 #endif /* GST_DISABLE_GST_DEBUG */
84 /* ------------------------------------------------------------------------- */
86 /* Loading and initing */
88 #if defined(HAVE_ISO_CODES)
/* Attribute lookup helper used by parse_start_element(): walks the
 * NULL-terminated @names array looking for an entry equal to @name.
 * NOTE(review): only part of this function is visible in this excerpt;
 * presumably @vals is advanced in step with @names and the value belonging to
 * the matching name (or NULL if there is none) is returned - confirm against
 * the complete source. */
90 get_val (const gchar ** names, const gchar ** vals, const gchar * name)
/* a NULL @names array is treated like an empty one */
92 while (names != NULL && *names != NULL) {
93 if (strcmp (*names, name) == 0)
/* GMarkup start-element handler (installed through the GMarkupParser set up
 * in gst_tag_load_iso_639_xml()). For an "iso_639_entry" element it reads the
 * ISO 639-1, 639-2T and 639-2B codes and the language name from the element's
 * attributes and stores code -> name mappings in the GHashTable passed in as
 * @user_data. */
102 parse_start_element (GMarkupParseContext * ctx, const gchar * element_name,
103 const gchar ** attr_names, const gchar ** attr_vals,
104 gpointer user_data, GError ** error)
106 GHashTable *ht = (GHashTable *) user_data;
107 const gchar *c1, *c2t, *c2b, *name, *tname;
/* elements other than iso_639_entry are of no interest */
109 if (strcmp (element_name, "iso_639_entry") != 0)
112 c1 = get_val (attr_names, attr_vals, "iso_639_1_code");
114 /* only interested in languages with an ISO 639-1 code for now */
118 c2t = get_val (attr_names, attr_vals, "iso_639_2T_code");
119 c2b = get_val (attr_names, attr_vals, "iso_639_2B_code");
120 name = get_val (attr_names, attr_vals, "name");
/* an entry lacking either ISO 639-2 code or the name is reported as broken */
122 if (c2t == NULL || c2b == NULL || name == NULL) {
123 GST_WARNING ("broken iso_639.xml entry: c2t=%p, c2b=%p, name=%p", c2t,
128 /* translate language name */
131 /* if no translation was found, it will return the input string, which we
132 * don't want to put into the hash table because it will be freed again */
133 if (G_UNLIKELY (tname == name))
134 tname = g_intern_string (name);
136 /* now overwrite default/fallback mappings with names in locale language */
137 g_hash_table_replace (ht, (gpointer) g_intern_string (c1), (gpointer) tname);
138 g_hash_table_replace (ht, (gpointer) g_intern_string (c2b), (gpointer) tname);
/* the 2T code only gets its own entry when it differs from the 2B code */
139 if (strcmp (c2t, c2b) != 0) {
140 g_hash_table_replace (ht, (gpointer) g_intern_string (c2t),
144 GST_LOG ("%s %s %s : %s - %s", c1, c2t, c2b, name, tname);
/* Loads language names from the iso-codes XML file into @ht: the contents of
 * ISO_639_XML_PATH are checked for UTF-8 validity and then run through a
 * GMarkup parser whose only handler is parse_start_element(), which receives
 * @ht as its user data. Read and parse failures are reported via
 * GST_WARNING. */
148 gst_tag_load_iso_639_xml (GHashTable * ht)
/* bind the GETTEXT_PACKAGE text domain to the iso-codes locale directory and
 * request its messages in UTF-8 */
156 GST_DEBUG ("binding text domain %s to locale dir %s", GETTEXT_PACKAGE,
157 ISO_CODES_LOCALEDIR);
158 bindtextdomain (GETTEXT_PACKAGE, ISO_CODES_LOCALEDIR);
159 bind_textdomain_codeset (GETTEXT_PACKAGE, "UTF-8");
/* get at the file contents through a (non-writable) file mapping */
162 f = g_mapped_file_new (ISO_639_XML_PATH, FALSE, NULL);
164 xml_data = (gchar *) g_mapped_file_get_contents (f);
165 xml_len = g_mapped_file_get_length (f);
/* NOTE(review): this reads the same path as the mapping attempt above, so it
 * is presumably the fallback for a failed mapping; the branch that selects
 * between the two is not visible in this excerpt - confirm */
167 if (!g_file_get_contents (ISO_639_XML_PATH, &xml_data, &xml_len, &err)) {
168 GST_WARNING ("Could not read %s: %s", ISO_639_XML_PATH, err->message);
/* the data is only handed to the parser if it is valid UTF-8 */
174 if (g_utf8_validate (xml_data, xml_len, NULL)) {
/* only the start-element handler is set, the other parser members are NULL */
175 GMarkupParser xml_parser = { parse_start_element, NULL, NULL, NULL, NULL };
176 GMarkupParseContext *ctx;
/* @ht is passed as user data and ends up in parse_start_element() */
178 ctx = g_markup_parse_context_new (&xml_parser, 0, ht, NULL);
179 if (!g_markup_parse_context_parse (ctx, xml_data, xml_len, &err)) {
180 GST_WARNING ("Parsing iso_639.xml failed: %s", err->message);
183 g_markup_parse_context_free (ctx);
/* invalid UTF-8: warn and dump the raw data to the debug log */
185 GST_WARNING ("iso_639.xml file is not valid UTF-8");
186 GST_MEMDUMP ("iso_639.xml file", (guint8 *) xml_data, xml_len);
189 /* ... and clean up */
191 g_mapped_file_unref (f);
195 #endif /* HAVE_ISO_CODES */
/* Returns the hash table that maps language codes to language names. The
 * table is built on the first call only, guarded by g_once_init_enter() /
 * g_once_init_leave(): it is filled from the static iso_639_codes and
 * iso_639_names tables and, when HAVE_ISO_CODES is defined, then passed to
 * gst_tag_load_iso_639_xml(). */
198 gst_tag_get_iso_639_ht (void)
/* doubles as once-token and as storage for the finished table pointer */
200 static gsize once_val = 0;
203 if (g_once_init_enter (&once_val)) {
207 GST_MEMDUMP ("iso 639 language names (internal default/fallback)",
208 (guint8 *) iso_639_names, sizeof (iso_639_names));
210 /* maps code -> language name; all strings are either interned strings
211 * or const static strings from lang-table.c */
/* no key/value destroy functions are installed on the table */
212 ht = g_hash_table_new (g_str_hash, g_str_equal);
214 /* set up default/fallback mappings */
215 for (i = 0; i < G_N_ELEMENTS (iso_639_codes); ++i) {
/* log each table entry; 'B' / 'T' show which ISO_639_FLAG_2x bits are set */
216 GST_LOG ("%3d %s %s %c%c 0x%04x %s", i, iso_639_codes[i].iso_639_1,
217 iso_639_codes[i].iso_639_2,
218 ((iso_639_codes[i].flags & ISO_639_FLAG_2B)) ? 'B' : '.',
219 ((iso_639_codes[i].flags & ISO_639_FLAG_2T)) ? 'T' : '.',
220 iso_639_codes[i].name_offset,
221 iso_639_names + iso_639_codes[i].name_offset);
223 #ifdef HAVE_ISO_CODES
224 /* intern these in order to minimise allocations when interning strings
225 * read from the xml file later */
226 g_intern_static_string (iso_639_codes[i].iso_639_1);
227 g_intern_static_string (iso_639_codes[i].iso_639_2);
228 g_intern_static_string (iso_639_names + iso_639_codes[i].name_offset);
231 /* and add default mapping (these strings are always valid) */
/* the -1 code and the -2 code of an entry both map to the same name */
232 g_hash_table_insert (ht, (gpointer) iso_639_codes[i].iso_639_1,
233 (gpointer) (iso_639_names + iso_639_codes[i].name_offset));
234 g_hash_table_insert (ht, (gpointer) iso_639_codes[i].iso_639_2,
235 (gpointer) (iso_639_names + iso_639_codes[i].name_offset));
238 #ifdef HAVE_ISO_CODES
/* load the names from iso_639.xml and log how long that took */
240 GstClockTime ts = gst_util_get_timestamp ();
242 gst_tag_load_iso_639_xml (ht);
244 ts = gst_util_get_timestamp () - ts;
245 GST_INFO ("iso_639.xml loading took %.2gms", (double) ts / GST_MSECOND);
/* NOTE(review): presumably the alternative branch of the HAVE_ISO_CODES
 * conditional above; the directive itself is not visible in this excerpt */
248 GST_INFO ("iso-codes disabled or not available");
/* publish the finished table through the once-token */
251 done_val = (gsize) ht;
252 g_once_init_leave (&once_val, done_val);
255 return (GHashTable *) once_val;
258 /* ------------------------------------------------------------------------- */
/* qsort() comparator for arrays of C strings: each argument points at an
 * array element (a string pointer); the strings themselves are ordered with
 * strcmp(). */
static int
qsort_strcmp_func (const void *p1, const void *p2)
{
  const char *const *lhs = p1;
  const char *const *rhs = p2;

  return strcmp (*lhs, *rhs);
}
267 * gst_tag_get_language_codes:
269 * Returns a list of known language codes (in form of two-letter ISO-639-1
270 * codes). This is useful for UIs to build a list of available languages for
271 * tagging purposes (e.g. to tag an audio track appropriately in a video or
274 * Returns: NULL-terminated string array with two-letter language codes. Free
275 * with g_strfreev() when no longer needed.
280 gst_tag_get_language_codes (void)
288 ensure_debug_category ();
290 ht = gst_tag_get_iso_639_ht ();
292 /* we have at least two keys for each language (-1 code and -2 code) */
293 codes = g_new (gchar *, (g_hash_table_size (ht) / 2) + 1);
296 g_hash_table_iter_init (&iter, ht);
297 while (g_hash_table_iter_next (&iter, &key, NULL)) {
298 const gchar *lang_code = key;
300 if (strlen (lang_code) == 2) {
301 codes[i] = g_strdup (lang_code);
307 /* be nice and sort the list */
308 qsort (&codes[0], i, sizeof (gchar *), qsort_strcmp_func);
314 * gst_tag_get_language_name:
315 * @language_code: two or three-letter ISO-639 language code
317 * Returns the name of the language given an ISO-639 language code, such
318 * as often found in a GST_TAG_LANGUAGE tag. The name will be translated
319 * according to the current locale (if the library was built against the
320 * iso-codes package, otherwise the English name will be returned).
322 * Language codes are case-sensitive and expected to be lower case.
324 * Returns: language name in UTF-8 format, or NULL if @language_code could
325 * not be mapped to a language name. The returned string must not be
326 * modified and does not need to freed; it will stay valid until the
327 * application is terminated.
332 gst_tag_get_language_name (const gchar * language_code)
334 const gchar *lang_name;
337 g_return_val_if_fail (language_code != NULL, NULL);
339 ensure_debug_category ();
341 ht = gst_tag_get_iso_639_ht ();
343 lang_name = g_hash_table_lookup (ht, (gpointer) language_code);
344 GST_LOG ("%s -> %s", language_code, GST_STR_NULL (lang_name));
350 * gst_tag_get_language_code_iso_639_1:
351 * @lang_code: ISO-639 language code (e.g. "deu" or "ger" or "de")
353 * Returns two-letter ISO-639-1 language code given a three-letter ISO-639-2
354 * language code or two-letter ISO-639-1 language code (both are accepted for
357 * Language codes are case-sensitive and expected to be lower case.
359 * Returns: two-letter ISO-639-1 language code string that maps to @lang_code,
360 * or NULL if no mapping is known. The returned string must not be
366 gst_tag_get_language_code_iso_639_1 (const gchar * lang_code)
368 const gchar *c = NULL;
371 g_return_val_if_fail (lang_code != NULL, NULL);
373 ensure_debug_category ();
375 /* FIXME: we are being a bit inconsistent here in the sense that will only
376 * map the language codes from our static table. Theoretically the iso-codes
377 * XML file might have had additional codes that are now in the hash table.
378 * We keep it simple for now and don't waste memory on additional tables. */
379 for (i = 0; i < G_N_ELEMENTS (iso_639_codes); ++i) {
380 /* we check both codes here, so function can be used in a more versatile
381 * way, to convert a language tag to a two-letter language code and/or
382 * verify an existing code */
383 if (strcmp (lang_code, iso_639_codes[i].iso_639_1) == 0 ||
384 strcmp (lang_code, iso_639_codes[i].iso_639_2) == 0) {
385 c = iso_639_codes[i].iso_639_1;
390 GST_LOG ("%s -> %s", lang_code, GST_STR_NULL (c));
396 gst_tag_get_language_code_iso_639_2X (const gchar * lang_code, guint8 flags)
400 /* FIXME: we are being a bit inconsistent here in the sense that we will only
401 * map the language codes from our static table. Theoretically the iso-codes
402 * XML file might have had additional codes that are now in the hash table.
403 * We keep it simple for now and don't waste memory on additional tables.
404 * Also, we currently only parse the iso_639.xml file if language names or
405 * a list of all codes is requested, and it'd be nice to keep it like that. */
406 for (i = 0; i < G_N_ELEMENTS (iso_639_codes); ++i) {
407 /* we check both codes here, so function can be used in a more versatile
408 * way, to convert a language tag to a three-letter language code and/or
409 * verify an existing code */
410 if (strcmp (lang_code, iso_639_codes[i].iso_639_1) == 0 ||
411 strcmp (lang_code, iso_639_codes[i].iso_639_2) == 0) {
412 if ((iso_639_codes[i].flags & flags) == flags) {
413 return iso_639_codes[i].iso_639_2;
414 } else if (i > 0 && (iso_639_codes[i - 1].flags & flags) == flags &&
415 iso_639_codes[i].name_offset == iso_639_codes[i - 1].name_offset) {
416 return iso_639_codes[i - 1].iso_639_2;
417 } else if (i < G_N_ELEMENTS (iso_639_codes) &&
418 (iso_639_codes[i + 1].flags & flags) == flags &&
419 iso_639_codes[i].name_offset == iso_639_codes[i + 1].name_offset) {
420 return iso_639_codes[i + 1].iso_639_2;
428 * gst_tag_get_language_code_iso_639_2T:
429 * @lang_code: ISO-639 language code (e.g. "deu" or "ger" or "de")
431 * Returns three-letter ISO-639-2 "terminological" language code given a
432 * two-letter ISO-639-1 language code or a three-letter ISO-639-2 language
433 * code (both are accepted for convenience).
435 * The "terminological" code is derived from the local name of the language
436 * (e.g. "deu" for German instead of "ger"). In most scenarios, the
437 * "terminological" codes are prefered over the "bibliographic" ones.
439 * Language codes are case-sensitive and expected to be lower case.
441 * Returns: three-letter ISO-639-2 language code string that maps to @lang_code,
442 * or NULL if no mapping is known. The returned string must not be
448 gst_tag_get_language_code_iso_639_2T (const gchar * lang_code)
452 g_return_val_if_fail (lang_code != NULL, NULL);
454 ensure_debug_category ();
456 c = gst_tag_get_language_code_iso_639_2X (lang_code, ISO_639_FLAG_2T);
458 GST_LOG ("%s -> %s", lang_code, GST_STR_NULL (c));
464 * gst_tag_get_language_code_iso_639_2B:
465 * @lang_code: ISO-639 language code (e.g. "deu" or "ger" or "de")
467 * Returns three-letter ISO-639-2 "bibliographic" language code given a
468 * two-letter ISO-639-1 language code or a three-letter ISO-639-2 language
469 * code (both are accepted for convenience).
471 * The "bibliographic" code is derived from the English name of the language
472 * (e.g. "ger" for German instead of "de" or "deu"). In most scenarios, the
473 * "terminological" codes are prefered.
475 * Language codes are case-sensitive and expected to be lower case.
477 * Returns: three-letter ISO-639-2 language code string that maps to @lang_code,
478 * or NULL if no mapping is known. The returned string must not be
484 gst_tag_get_language_code_iso_639_2B (const gchar * lang_code)
488 g_return_val_if_fail (lang_code != NULL, NULL);
490 ensure_debug_category ();
492 c = gst_tag_get_language_code_iso_639_2X (lang_code, ISO_639_FLAG_2B);
494 GST_LOG ("%s -> %s", lang_code, GST_STR_NULL (c));