+2006-03-24 Tim-Philipp Müller <tim at centricular dot net>
+
+ * gst/subparse/gstsubparse.c: (convert_encoding),
+ (gst_sub_parse_change_state):
+ * gst/subparse/gstsubparse.h:
+ Text subtitle files may or may not be UTF-8. If they are not, we
+ don't really want to see '?' characters in place of non-ASCII
+ characters like accented characters. So let's assume the input
+ is UTF-8 until we come across text that is clearly not. If it's
+ not UTF-8, we don't really know what it is, so try the following:
+ (a) see whether the GST_SUBTITLE_ENCODING environment variable
+ is set; if not, check (b) if the current locale encoding is
+ non-UTF-8 and use that if it is, or (c) assume ISO-8859-15 if
+ the current locale encoding is UTF-8 and the environment variable
+ was not set to any particular encoding. Not perfect, but better
+ than nothing (and better than before, I think) (fixes #172848).
+
2006-03-24 Thomas Vander Stichele <thomas at apestaart dot org>
* configure.ac:
+/* convert_encoding:
+ * @self: the subtitle parser; its valid_utf8 flag is read and may be cleared
+ * @str: input text, @len bytes long
+ * @len: number of bytes in @str
+ *
+ * Returns a newly allocated UTF-8 copy of @str. While valid_utf8 is set and
+ * the input validates as UTF-8 it is copied unchanged. Otherwise the flag is
+ * cleared and the input is converted from the encoding named by the
+ * GST_SUBTITLE_ENCODING environment variable, or from the locale charset if
+ * that is not UTF-8, or from ISO-8859-15. If that conversion fails, it is
+ * retried from ISO-8859-15.
+ */
static gchar *
convert_encoding (GstSubParse * self, const gchar * str, gsize len)
{
- gsize bytes_read, bytes_written;
- gchar *rv;
- GString *converted;
+ const gchar *encoding;
+ GError *err = NULL;
+ gchar *ret;
+ /* remembers whether the ISO-8859-15 retry was needed, so the final log
+ * message does not have to inspect the already-freed error */
+ gboolean used_fallback = FALSE;
- converted = g_string_new (NULL);
- while (len) {
-#ifndef GST_DISABLE_GST_DEBUG
- gchar *dbg = g_strndup (str, len);
+ if (self->valid_utf8) {
+ if (g_utf8_validate (str, len, NULL)) {
+ GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed");
+ return g_strndup (str, len);
+ }
+ GST_INFO_OBJECT (self, "invalid UTF-8!");
+ self->valid_utf8 = FALSE;
+ }
- GST_DEBUG ("Trying to convert '%s'", dbg);
- g_free (dbg);
-#endif
+ encoding = g_getenv ("GST_SUBTITLE_ENCODING");
+ if (encoding == NULL || *encoding == '\0') {
+ /* if local encoding is UTF-8 and no encoding specified
+ * via the environment variable, assume ISO-8859-15 */
+ if (g_get_charset (&encoding)) {
+ encoding = "ISO-8859-15";
+ }
+ }
- rv = g_locale_to_utf8 (str, len, &bytes_read, &bytes_written, NULL);
- if (rv) {
- g_string_append_len (converted, rv, bytes_written);
- g_free (rv);
+ ret = g_convert_with_fallback (str, len, "UTF-8", encoding, "*", NULL,
+ NULL, &err);
- len -= bytes_read;
- str += bytes_read;
- }
- if (len) {
- /* conversion error ocurred => skip one char */
- len--;
- str++;
- g_string_append_c (converted, '?');
- }
+ if (err) {
+ GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s",
+ encoding, err->message);
+ g_error_free (err);
+ /* do not leave a dangling pointer behind */
+ err = NULL;
+
+ /* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */
+ ret = g_convert_with_fallback (str, len, "UTF-8", "ISO-8859-15", "*",
+ NULL, NULL, NULL);
+ used_fallback = TRUE;
}
- rv = converted->str;
- g_string_free (converted, FALSE);
- GST_DEBUG ("Converted to '%s'", rv);
- return rv;
+
+ /* len is a gsize, so it must be printed with G_GSIZE_FORMAT, not %d */
+ GST_LOG_OBJECT (self, "successfully converted %" G_GSIZE_FORMAT
+ " characters from %s to UTF-8%s", len, encoding,
+ (used_fallback) ? " , using ISO-8859-15 as fallback" : "");
+
+ return ret;
}
static gchar *
/* format detection will init the parser state */
self->offset = self->next_offset = 0;
self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
+ self->valid_utf8 = TRUE;
break;
default:
break;