1 /* utf8conf.c - UTF8 character set conversion
2 * Copyright (C) 1994, 1998, 1999, 2000, 2001, 2003, 2006,
3 * 2008, 2010 Free Software Foundation, Inc.
5 * This file is part of GnuPG.
7 * GnuPG is free software; you can redistribute it and/or modify it
8 * under the terms of either
10 * - the GNU Lesser General Public License as published by the Free
11 * Software Foundation; either version 3 of the License, or (at
12 * your option) any later version.
16 * - the GNU General Public License as published by the Free
17 * Software Foundation; either version 2 of the License, or (at
18 * your option) any later version.
20 * or both in parallel, as here.
22 * GnuPG is distributed in the hope that it will be useful, but
23 * WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 * General Public License for more details.
27 * You should have received a copies of the GNU General Public License
28 * and the GNU Lesser General Public License along with this program;
29 * if not, see <http://www.gnu.org/licenses/>.
37 #ifdef HAVE_LANGINFO_CODESET
41 #ifndef HAVE_ANDROID_SYSTEM
46 #include "common-defs.h"
48 #include "stringhelp.h"
55 static const char *active_charset_name = "iso-8859-1";
56 static int no_translation; /* Set to true if we let simply pass through. */
57 static int use_iconv; /* iconv conversion functions required. */
60 #ifdef HAVE_ANDROID_SYSTEM
61 /* Fake stuff to get things building. */
62 typedef void *iconv_t;
66 iconv_open (const char *tocode, const char *fromcode)
74 iconv (iconv_t cd, char **inbuf, size_t *inbytesleft,
75 char **outbuf, size_t *outbytesleft)
86 iconv_close (iconv_t cd)
91 #endif /*HAVE_ANDROID_SYSTEM*/
94 /* Error handler for iconv failures. This is needed to not clutter the
95 output with repeated diagnostics about a missing conversion. */
97 handle_iconv_error (const char *to, const char *from, int use_fallback)
101 static int shown1, shown2;
104 if (to && !strcmp (to, "utf-8"))
116 log_info (_("conversion from '%s' to '%s' not available\n"),
124 log_info (_("iconv_open failed: %s\n"), strerror (errno));
130 /* To avoid further error messages we fall back to UTF-8 for the
131 native encoding. Nowadays this seems to be the best bet in
132 case of errors from iconv or nl_langinfo. */
133 active_charset_name = "utf-8";
142 set_native_charset (const char *newset)
144 const char *full_newset;
148 #ifdef HAVE_ANDROID_SYSTEM
150 #elif defined HAVE_W32_SYSTEM
151 static char codepage[30];
155 /* We are a console program thus we need to use the
156 GetConsoleOutputCP function and not the GetACP which
157 would give the codepage for a GUI program. Note this is not
158 a bulletproof detection because GetConsoleCP might return a
159 different one for console input. Not sure how to cope with
160 that. If the console Code page is not known we fall back to
161 the system code page. */
162 #ifndef HAVE_W32CE_SYSTEM
163 cpno = GetConsoleOutputCP ();
167 sprintf (codepage, "CP%u", cpno );
168 /* Resolve alias. We use a long string and not the usual
169 array to optimize if the code is taken to a DSO. Taken from
172 for (aliases = ("CP936" "\0" "GBK" "\0"
173 "CP1361" "\0" "JOHAB" "\0"
174 "CP20127" "\0" "ASCII" "\0"
175 "CP20866" "\0" "KOI8-R" "\0"
176 "CP21866" "\0" "KOI8-RU" "\0"
177 "CP28591" "\0" "ISO-8859-1" "\0"
178 "CP28592" "\0" "ISO-8859-2" "\0"
179 "CP28593" "\0" "ISO-8859-3" "\0"
180 "CP28594" "\0" "ISO-8859-4" "\0"
181 "CP28595" "\0" "ISO-8859-5" "\0"
182 "CP28596" "\0" "ISO-8859-6" "\0"
183 "CP28597" "\0" "ISO-8859-7" "\0"
184 "CP28598" "\0" "ISO-8859-8" "\0"
185 "CP28599" "\0" "ISO-8859-9" "\0"
186 "CP28605" "\0" "ISO-8859-15" "\0"
187 "CP65001" "\0" "UTF-8" "\0");
189 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
191 if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1]))
193 newset = aliases + strlen (aliases) + 1;
198 #else /*!HAVE_W32_SYSTEM && !HAVE_ANDROID_SYSTEM*/
200 #ifdef HAVE_LANGINFO_CODESET
201 newset = nl_langinfo (CODESET);
202 #else /*!HAVE_LANGINFO_CODESET*/
203 /* Try to get the used charset from environment variables. */
204 static char codepage[30];
205 const char *lc, *dot, *mod;
207 strcpy (codepage, "iso-8859-1");
208 lc = getenv ("LC_ALL");
211 lc = getenv ("LC_CTYPE");
213 lc = getenv ("LANG");
217 dot = strchr (lc, '.');
220 mod = strchr (++dot, '@');
222 mod = dot + strlen (dot);
223 if (mod - dot < sizeof codepage && dot != mod)
225 memcpy (codepage, dot, mod - dot);
226 codepage [mod - dot] = 0;
231 #endif /*!HAVE_LANGINFO_CODESET*/
232 #endif /*!HAVE_W32_SYSTEM && !HAVE_ANDROID_SYSTEM*/
235 full_newset = newset;
236 if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3))
239 if (*newset == '-' || *newset == '_')
243 /* Note that we silently assume that plain ASCII is actually meant
244 as Latin-1. This makes sense because many Unix systems don't have
245 their locale set up properly and thus would get annoying error
246 messages and we have to handle all the "bug" reports. Latin-1 has
247 always been the character set used for 8 bit characters on Unix
250 || !ascii_strcasecmp (newset, "8859-1" )
251 || !ascii_strcasecmp (newset, "646" )
252 || !ascii_strcasecmp (newset, "ASCII" )
253 || !ascii_strcasecmp (newset, "ANSI_X3.4-1968" )
256 active_charset_name = "iso-8859-1";
260 else if ( !ascii_strcasecmp (newset, "utf8" )
261 || !ascii_strcasecmp(newset, "utf-8") )
263 active_charset_name = "utf-8";
271 cd = iconv_open (full_newset, "utf-8");
272 if (cd == (iconv_t)-1)
274 handle_iconv_error (full_newset, "utf-8", 0);
278 cd = iconv_open ("utf-8", full_newset);
279 if (cd == (iconv_t)-1)
281 handle_iconv_error ("utf-8", full_newset, 0);
285 active_charset_name = full_newset;
293 get_native_charset ()
295 return active_charset_name;
298 /* Return true if the native charset is utf-8. */
300 is_native_utf8 (void)
302 return no_translation;
306 /* Convert string, which is in native encoding, to UTF-8 and return a
307 newly allocated UTF-8 string. This function terminates the process
308 on memory shortage. */
310 native_to_utf8 (const char *orig_string)
312 const unsigned char *string = (const unsigned char *)orig_string;
313 const unsigned char *s;
320 /* Already utf-8 encoded. */
321 buffer = xstrdup (orig_string);
325 /* For Latin-1 we can avoid the iconv overhead. */
326 for (s = string; *s; s++)
332 buffer = xmalloc (length + 1);
333 for (p = (unsigned char *)buffer, s = string; *s; s++)
337 *p++ = 0xc0 | ((*s >> 6) & 3);
338 *p++ = 0x80 | (*s & 0x3f);
347 /* Need to use iconv. */
351 size_t inbytes, outbytes;
353 cd = iconv_open ("utf-8", active_charset_name);
354 if (cd == (iconv_t)-1)
356 handle_iconv_error ("utf-8", active_charset_name, 1);
357 return native_to_utf8 (string);
360 for (s=string; *s; s++ )
364 length += 5; /* We may need up to 6 bytes for the utf8 output. */
366 buffer = xmalloc (length + 1);
369 inbytes = strlen (string);
372 if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
373 &outptr, &outbytes) == (size_t)-1)
378 log_info (_("conversion from '%s' to '%s' failed: %s\n"),
379 active_charset_name, "utf-8", strerror (errno));
381 /* We don't do any conversion at all but use the strings as is. */
382 strcpy (buffer, string);
387 /* We could realloc the buffer now but I doubt that it makes
388 much sense given that it will get freed anyway soon
399 do_utf8_to_native (const char *string, size_t length, int delim,
404 unsigned char encbuf[8];
406 const unsigned char *s;
410 unsigned long val = 0;
414 /* First pass (p==NULL): count the extended utf-8 characters. */
415 /* Second pass (p!=NULL): create string. */
418 for (slen = length, nleft = encidx = 0, n = 0,
419 s = (const unsigned char *)string;
425 if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)))
430 sprintf (p, "\\x%02x", *s);
444 && (*s < 0x20 || *s == 0x7f || *s == delim
445 || (delim && *s == '\\')))
452 case '\n': n++; if ( p ) *p++ = 'n'; break;
453 case '\r': n++; if ( p ) *p++ = 'r'; break;
454 case '\f': n++; if ( p ) *p++ = 'f'; break;
455 case '\v': n++; if ( p ) *p++ = 'v'; break;
456 case '\b': n++; if ( p ) *p++ = 'b'; break;
457 case 0: n++; if ( p ) *p++ = '0'; break;
462 sprintf (p, "x%02x", *s);
475 else if ((*s & 0xe0) == 0xc0) /* 110x xxxx */
480 encbuf[encidx++] = *s;
482 else if ((*s & 0xf0) == 0xe0) /* 1110 xxxx */
487 encbuf[encidx++] = *s;
489 else if ((*s & 0xf8) == 0xf0) /* 1111 0xxx */
494 encbuf[encidx++] = *s;
496 else if ((*s & 0xfc) == 0xf8) /* 1111 10xx */
501 encbuf[encidx++] = *s;
503 else if ((*s & 0xfe) == 0xfc) /* 1111 110x */
508 encbuf[encidx++] = *s;
510 else /* Invalid encoding: print as \xNN. */
514 sprintf (p, "\\x%02x", *s);
521 else if (*s < 0x80 || *s >= 0xc0) /* Invalid utf-8 */
525 for (i = 0; i < encidx; i++)
527 sprintf (p, "\\x%02x", encbuf[i]);
530 sprintf (p, "\\x%02x", *s);
540 encbuf[encidx++] = *s;
543 if (!--nleft) /* Ready. */
549 for (i = 0; i < encidx; i++)
557 /* Our strategy for using iconv is a bit strange
558 but it better keeps compatibility with
559 previous versions in regard to how invalid
560 encodings are displayed. What we do is to
561 keep the utf-8 as is and have the real
562 translation step then at the end. Yes, I
563 know that this is ugly. However we are short
564 of the 1.4 release and for this branch we
565 should not mess too much around with iconv
566 things. One reason for this is that we don't
567 know enough about non-GNU iconv
568 implementations and want to minimize the risk
569 of breaking the code on too many platforms. */
572 for (i=0; i < encidx; i++ )
578 else /* Latin-1 case. */
580 if (val >= 0x80 && val < 256)
582 /* We can simply print this character */
589 /* We do not have a translation: print utf8. */
592 for (i = 0; i < encidx; i++)
594 sprintf (p, "\\x%02x", encbuf[i]);
608 /* Allocate the buffer after the first pass. */
609 buffer = p = xmalloc (n + 1);
613 /* Note: See above for comments. */
616 char *outbuf, *outptr;
617 size_t inbytes, outbytes;
619 *p = 0; /* Terminate the buffer. */
621 cd = iconv_open (active_charset_name, "utf-8");
622 if (cd == (iconv_t)-1)
624 handle_iconv_error (active_charset_name, "utf-8", 1);
626 return utf8_to_native (string, length, delim);
629 /* Allocate a new buffer large enough to hold all possible
634 outbytes = n * MB_LEN_MAX;
635 if (outbytes / MB_LEN_MAX != n)
636 BUG (); /* Actually an overflow. */
637 outbuf = outptr = xmalloc (outbytes);
638 if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
639 &outptr, &outbytes) == (size_t)-1)
644 log_info (_("conversion from '%s' to '%s' failed: %s\n"),
645 "utf-8", active_charset_name, strerror (errno));
647 /* Didn't work out. Try again but without iconv. */
651 outbuf = do_utf8_to_native (string, length, delim, 0);
655 *outptr = 0; /* Make sure it is a string. */
656 /* We could realloc the buffer now but I doubt that it
657 makes much sense given that it will get freed
658 anyway soon after. */
664 else /* Not using iconv. */
666 *p = 0; /* Make sure it is a string. */
672 /* Convert string, which is in UTF-8 to native encoding. Replace
673 illegal encodings by some "\xnn" and quote all control
674 characters. A character with value DELIM will always be quoted, it
675 must be a vanilla ASCII character. A DELIM value of -1 is special:
676 it disables all quoting of control characters. This function
677 terminates the process on memory shortage. */
679 utf8_to_native (const char *string, size_t length, int delim)
681 return do_utf8_to_native (string, length, delim, use_iconv);
687 /* Wrapper function for iconv_open, required for W32 as we dlopen that
688 library on that system. */
690 jnlib_iconv_open (const char *tocode, const char *fromcode)
692 return (jnlib_iconv_t)iconv_open (tocode, fromcode);
696 /* Wrapper function for iconv, required for W32 as we dlopen that
697 library on that system. */
699 jnlib_iconv (jnlib_iconv_t cd,
700 const char **inbuf, size_t *inbytesleft,
701 char **outbuf, size_t *outbytesleft)
703 return iconv ((iconv_t)cd, (char**)inbuf, inbytesleft, outbuf, outbytesleft);
706 /* Wrapper function for iconv_close, required for W32 as we dlopen that
707 library on that system. */
709 jnlib_iconv_close (jnlib_iconv_t cd)
711 return iconv_close ((iconv_t)cd);
715 #ifdef HAVE_W32_SYSTEM
716 /* Return a malloced string encoded in UTF-8 from the wide char input
717 string STRING. Caller must free this value. Returns NULL and sets
718 ERRNO on failure. Calling this function with STRING set to NULL is
721 wchar_to_utf8 (const wchar_t *string)
726 n = WideCharToMultiByte (CP_UTF8, 0, string, -1, NULL, 0, NULL, NULL);
729 gpg_err_set_errno (EINVAL);
733 result = xtrymalloc (n+1);
737 n = WideCharToMultiByte (CP_UTF8, 0, string, -1, result, n, NULL, NULL);
741 gpg_err_set_errno (EINVAL);
748 /* Return a malloced wide char string from an UTF-8 encoded input
749 string STRING. Caller must free this value. Returns NULL and sets
750 ERRNO on failure. Calling this function with STRING set to NULL is
753 utf8_to_wchar (const char *string)
759 n = MultiByteToWideChar (CP_UTF8, 0, string, -1, NULL, 0);
762 gpg_err_set_errno (EINVAL);
766 nbytes = (size_t)(n+1) * sizeof(*result);
767 if (nbytes / sizeof(*result) != (n+1))
769 gpg_err_set_errno (ENOMEM);
772 result = xtrymalloc (nbytes);
776 n = MultiByteToWideChar (CP_UTF8, 0, string, -1, result, n);
780 gpg_err_set_errno (EINVAL);
785 #endif /*HAVE_W32_SYSTEM*/