From 2ecb455728cad980aee77a300cc9584858e26c13 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Tim-Philipp=20M=C3=BCller?= Date: Fri, 24 Mar 2006 17:57:39 +0000 Subject: [PATCH] gst/subparse/gstsubparse.*: Text subtitle files may or may not be UTF-8. If it's not, we don't really want to see '?'... Original commit message from CVS: * gst/subparse/gstsubparse.c: (convert_encoding), (gst_sub_parse_change_state): * gst/subparse/gstsubparse.h: Text subtitle files may or may not be UTF-8. If it's not, we don't really want to see '?' characters in place of non-ASCII characters like accented characters. So let's assume the input is UTF-8 until we come across text that is clearly not. If it's not UTF-8, we don't really know what it is, so try the following: (a) see whether the GST_SUBTITLE_ENCODING environment variable is set; if not, check (b) if the current locale encoding is non-UTF-8 and use that if it is, or (c) assume ISO-8859-15 if the current locale encoding is UTF-8 and the environment variable was not set to any particular encoding. Not perfect, but better than nothing (and better than before, I think) (fixes #172848). --- ChangeLog | 17 +++++++++++++ gst/subparse/gstsubparse.c | 62 ++++++++++++++++++++++++++-------------------- gst/subparse/gstsubparse.h | 1 + 3 files changed, 53 insertions(+), 27 deletions(-) diff --git a/ChangeLog b/ChangeLog index 425b56a..d163f1a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +2006-03-24 Tim-Philipp Müller + + * gst/subparse/gstsubparse.c: (convert_encoding), + (gst_sub_parse_change_state): + * gst/subparse/gstsubparse.h: + Text subtitle files may or may not be UTF-8. If it's not, we + don't really want to see '?' characters in place of non-ASCII + characters like accented characters. So let's assume the input + is UTF-8 until we come across text that is clearly not. If it's + not UTF-8, we don't really know what it is, so try the following: + (a) see whether the GST_SUBTITLE_ENCODING environment variable + is set; if not, check (b) if the current locale encoding is + non-UTF-8 and use that if it is, or (c) assume ISO-8859-15 if + the current locale encoding is UTF-8 and the environment variable + was not set to any particular encoding. Not perfect, but better + than nothing (and better than before, I think) (fixes #172848). + 2006-03-24 Thomas Vander Stichele * configure.ac: diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c index 9dd1acc..34b874f 100644 --- a/gst/subparse/gstsubparse.c +++ b/gst/subparse/gstsubparse.c @@ -230,38 +230,45 @@ beach: static gchar * convert_encoding (GstSubParse * self, const gchar * str, gsize len) { - gsize bytes_read, bytes_written; - gchar *rv; - GString *converted; + const gchar *encoding; + GError *err = NULL; + gchar *ret; - converted = g_string_new (NULL); - while (len) { -#ifndef GST_DISABLE_GST_DEBUG - gchar *dbg = g_strndup (str, len); + if (self->valid_utf8) { + if (g_utf8_validate (str, len, NULL)) { + GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed"); + return g_strndup (str, len); + } + GST_INFO_OBJECT (self, "invalid UTF-8!"); + self->valid_utf8 = FALSE; + } - GST_DEBUG ("Trying to convert '%s'", dbg); - g_free (dbg); -#endif + encoding = g_getenv ("GST_SUBTITLE_ENCODING"); + if (encoding == NULL || *encoding == '\0') { + /* if local encoding is UTF-8 and no encoding specified + * via the environment variable, assume ISO-8859-15 */ + if (g_get_charset (&encoding)) { + encoding = "ISO-8859-15"; + } + } - rv = g_locale_to_utf8 (str, len, &bytes_read, &bytes_written, NULL); - if (rv) { - g_string_append_len (converted, rv, bytes_written); - g_free (rv); + ret = g_convert_with_fallback (str, len, "UTF-8", encoding, "*", NULL, + NULL, &err); - len -= bytes_read; - str += bytes_read; - } - if (len) { - /* conversion error ocurred => skip one char */ - len--; - str++; - g_string_append_c (converted, '?'); - } + if (err) { + GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s", + encoding, err->message); + g_error_free (err); + + /* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */ + ret = g_convert_with_fallback (str, len, "UTF-8", "ISO-8859-15", "*", + NULL, NULL, NULL); } - rv = converted->str; - g_string_free (converted, FALSE); - GST_DEBUG ("Converted to '%s'", rv); - return rv; + + GST_LOG_OBJECT (self, "successfully converted %d characters from %s to UTF-8" + "%s", len, encoding, (err) ? " , using ISO-8859-15 as fallback" : ""); + + return ret; } static gchar * @@ -833,6 +840,7 @@ gst_sub_parse_change_state (GstElement * element, GstStateChange transition) /* format detection will init the parser state */ self->offset = self->next_offset = 0; self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN; + self->valid_utf8 = TRUE; break; default: break; diff --git a/gst/subparse/gstsubparse.h b/gst/subparse/gstsubparse.h index 94b0f3a..6757aad 100644 --- a/gst/subparse/gstsubparse.h +++ b/gst/subparse/gstsubparse.h @@ -81,6 +81,7 @@ struct _GstSubParse { gboolean need_segment; gboolean flushing; + gboolean valid_utf8; }; struct _GstSubParseClass { -- 2.7.4