1 /* Character set conversion support for GDB.
3 Copyright (C) 2001, 2003, 2007, 2008, 2009, 2010
4 Free Software Foundation, Inc.
6 This file is part of GDB.
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
24 #include "gdb_assert.h"
25 #include "gdb_obstack.h"
27 #include "charset-list.h"
32 #include "gdb_string.h"
36 /* How GDB's character set support works
38 GDB has three global settings:
40 - The `current host character set' is the character set GDB should
41 use in talking to the user, and which (hopefully) the user's
42 terminal knows how to display properly. Most users should not
45 - The `current target character set' is the character set the
46 program being debugged uses.
48 - The `current target wide character set' is the wide character set
49 the program being debugged uses, that is, the encoding used for
52 There are commands to set each of these, and mechanisms for
53 choosing reasonable default values. GDB has a global list of
54 character sets that it can use as its host or target character
57 The header file `charset.h' declares various functions that
58 different pieces of GDB need to perform tasks like:
60 - printing target strings and characters to the user's terminal
61 (mostly target->host conversions),
63 - building target-appropriate representations of strings and
64 characters the user enters in expressions (mostly host->target
69 To avoid excessive code duplication and maintenance efforts,
70 GDB simply requires a capable iconv function. Users on platforms
71 without a suitable iconv can use the GNU iconv library. */
76 /* Provide a phony iconv that does as little as possible. Also,
77 arrange for there to be a single available character set. */
79 #undef GDB_DEFAULT_HOST_CHARSET
80 #define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
81 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
82 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "ISO-8859-1"
83 #undef DEFAULT_CHARSET_NAMES
84 #define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,
93 #define ICONV_CONST const
95 /* Some systems don't have EILSEQ, so we define it here, but not as
96 EINVAL, because callers of `iconv' want to distinguish EINVAL and
97 EILSEQ. This is what iconv.h from libiconv does as well. Note
98 that wchar.h may also define EILSEQ, so this needs to be after we
99 include wchar.h, which happens in defs.h through gdb_wchar.h. */
101 #define EILSEQ ENOENT
105 iconv_open (const char *to, const char *from)
107 /* We allow conversions from UTF-32BE, wchar_t, and the host charset.
108 We allow conversions to wchar_t and the host charset. */
109 if (strcmp (from, "UTF-32BE") && strcmp (from, "wchar_t")
110 && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
112 if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
115 /* Return 1 if we are converting from UTF-32BE, 0 otherwise. This is
116 used as a flag in calls to iconv. */
117 return !strcmp (from, "UTF-32BE");
121 iconv_close (iconv_t arg)
127 iconv (iconv_t utf_flag, const char **inbuf, size_t *inbytesleft,
128 char **outbuf, size_t *outbytesleft)
132 while (*inbytesleft >= 4)
137 for (j = 0; j < 4; ++j)
140 c += (*inbuf)[j] & 0xff;
155 if (*inbytesleft < 4)
163 /* In all other cases we simply copy input bytes to the
165 size_t amt = *inbytesleft;
166 if (amt > *outbytesleft)
168 memcpy (*outbuf, *inbuf, amt);
172 *outbytesleft -= amt;
181 /* The number of non-reversible conversions -- but they were all
190 /* The global lists of character sets and translations. */
193 #ifndef GDB_DEFAULT_TARGET_CHARSET
194 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
197 #ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET
198 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "UTF-32"
201 static const char *auto_host_charset_name = GDB_DEFAULT_HOST_CHARSET;
202 static const char *host_charset_name = "auto";
/* Report the host character set setting on FILE.  When VALUE is
   "auto", the message also names the charset currently in effect,
   taken from auto_host_charset_name.  A second message simply echoes
   VALUE; presumably it is the non-"auto" branch, though the `else'
   is not visible in this excerpt.  */
204 show_host_charset_name (struct ui_file *file, int from_tty,
205 struct cmd_list_element *c,
208 if (!strcmp (value, "auto"))
209 fprintf_filtered (file,
210 _("The host character set is \"auto; currently %s\".\n"),
211 auto_host_charset_name);
213 fprintf_filtered (file, _("The host character set is \"%s\".\n"), value);
216 static const char *target_charset_name = GDB_DEFAULT_TARGET_CHARSET;
/* Report the target character set on FILE.  The argument substituted
   for %s is not visible in this excerpt; presumably it is VALUE.  */
218 show_target_charset_name (struct ui_file *file, int from_tty,
219 struct cmd_list_element *c, const char *value)
221 fprintf_filtered (file, _("The target character set is \"%s\".\n"),
225 static const char *target_wide_charset_name = GDB_DEFAULT_TARGET_WIDE_CHARSET;
/* Report the target wide character set on FILE.  The argument
   substituted for %s is not visible in this excerpt; presumably it
   is VALUE.  */
227 show_target_wide_charset_name (struct ui_file *file, int from_tty,
228 struct cmd_list_element *c, const char *value)
230 fprintf_filtered (file, _("The target wide character set is \"%s\".\n"),
234 static const char *default_charset_names[] =
236 DEFAULT_CHARSET_NAMES
240 static const char **charset_enum;
243 /* If the target wide character set has big- or little-endian
244 variants, these are the corresponding names. */
245 static const char *target_wide_charset_be_name;
246 static const char *target_wide_charset_le_name;
248 /* A helper function for validate which sets the target wide big- and
249 little-endian character set names, if possible. */
252 set_be_le_names (void)
256 target_wide_charset_le_name = NULL;
257 target_wide_charset_be_name = NULL;
259 len = strlen (target_wide_charset_name);
260 for (i = 0; charset_enum[i]; ++i)
262 if (strncmp (target_wide_charset_name, charset_enum[i], len))
264 if ((charset_enum[i][len] == 'B'
265 || charset_enum[i][len] == 'L')
266 && charset_enum[i][len + 1] == 'E'
267 && charset_enum[i][len + 2] == '\0')
269 if (charset_enum[i][len] == 'B')
270 target_wide_charset_be_name = charset_enum[i];
272 target_wide_charset_le_name = charset_enum[i];
277 /* Common code for the 'set charset', 'set host-charset',
278 'set target-charset', and 'set target-wide-charset' sfuncs. */
284 const char *host_cset = host_charset ();
286 desc = iconv_open (target_wide_charset_name, host_cset);
287 if (desc == (iconv_t) -1)
288 error ("Cannot convert between character sets `%s' and `%s'",
289 target_wide_charset_name, host_cset);
292 desc = iconv_open (target_charset_name, host_cset);
293 if (desc == (iconv_t) -1)
294 error ("Cannot convert between character sets `%s' and `%s'",
295 target_charset_name, host_cset);
301 /* This is the sfunc for the 'set charset' command. */
303 set_charset_sfunc (char *charset, int from_tty, struct cmd_list_element *c)
305 /* CAREFUL: set the target charset here as well. */
306 target_charset_name = host_charset_name;
310 /* 'set host-charset' command sfunc. We need a wrapper here because
311 the function needs to have a specific signature. */
313 set_host_charset_sfunc (char *charset, int from_tty,
314 struct cmd_list_element *c)
319 /* Wrapper for the 'set target-charset' command. */
321 set_target_charset_sfunc (char *charset, int from_tty,
322 struct cmd_list_element *c)
327 /* Wrapper for the 'set target-wide-charset' command. */
329 set_target_wide_charset_sfunc (char *charset, int from_tty,
330 struct cmd_list_element *c)
335 /* sfunc for the 'show charset' command. */
337 show_charset (struct ui_file *file, int from_tty, struct cmd_list_element *c,
340 show_host_charset_name (file, from_tty, c, host_charset_name);
341 show_target_charset_name (file, from_tty, c, target_charset_name);
342 show_target_wide_charset_name (file, from_tty, c, target_wide_charset_name);
346 /* Accessor functions. */
351 if (!strcmp (host_charset_name, "auto"))
352 return auto_host_charset_name;
353 return host_charset_name;
/* Return target_charset_name, the current target character set
   setting.  */
357 target_charset (void)
359 return target_charset_name;
/* Return the name of the target wide character set to use for
   BYTE_ORDER.  For BFD_ENDIAN_BIG, the big-endian variant name is
   returned when one has been recorded.  The little-endian variant
   name is likewise returned when recorded (presumably only for the
   non-big-endian case -- the `else' is not visible in this excerpt).
   When no variant name applies, fall back to
   target_wide_charset_name.  */
363 target_wide_charset (enum bfd_endian byte_order)
365 if (byte_order == BFD_ENDIAN_BIG)
367 if (target_wide_charset_be_name)
368 return target_wide_charset_be_name;
372 if (target_wide_charset_le_name)
373 return target_wide_charset_le_name;
376 return target_wide_charset_name;
380 /* Host character set management. For the time being, we assume that
381 the host character set is some superset of ASCII. */
384 host_letter_to_control_character (char c)
391 /* Convert a host character, C, to its hex value. C must already have
392 been validated using isxdigit. */
395 host_hex_value (char c)
399 if (c >= 'a' && c <= 'f')
401 gdb_assert (c >= 'A' && c <= 'F');
406 /* Public character management functions. */
408 /* A cleanup function which is run to close an iconv descriptor. */
411 cleanup_iconv (void *p)
414 iconv_close (*descp);
418 convert_between_encodings (const char *from, const char *to,
419 const gdb_byte *bytes, unsigned int num_bytes,
420 int width, struct obstack *output,
421 enum transliterations translit)
424 struct cleanup *cleanups;
427 unsigned int space_request;
429 /* Often, the host and target charsets will be the same. */
430 if (!strcmp (from, to))
432 obstack_grow (output, bytes, num_bytes);
436 desc = iconv_open (to, from);
437 if (desc == (iconv_t) -1)
438 perror_with_name ("Converting character sets");
439 cleanups = make_cleanup (cleanup_iconv, &desc);
442 inp = (char *) bytes;
444 space_request = num_bytes;
452 old_size = obstack_object_size (output);
453 obstack_blank (output, space_request);
455 outp = obstack_base (output) + old_size;
456 outleft = space_request;
458 r = iconv (desc, (ICONV_CONST char **) &inp, &inleft, &outp, &outleft);
460 /* Now make sure that the object on the obstack only includes
461 bytes we have converted. */
462 obstack_blank (output, - (int) outleft);
464 if (r == (size_t) -1)
472 /* Invalid input sequence. */
473 if (translit == translit_none)
474 error (_("Could not convert character to `%s' character set"),
477 /* We emit escape sequence for the bytes, skip them,
479 for (i = 0; i < width; ++i)
483 sprintf (octal, "\\%.3o", *inp & 0xff);
484 obstack_grow_str (output, octal);
493 /* We ran out of space in the output buffer. Make it
494 bigger next time around. */
499 /* Incomplete input sequence. FIXME: ought to report this
500 to the caller somehow. */
505 perror_with_name ("Internal error while converting character sets");
510 do_cleanups (cleanups);
515 /* An iterator that returns host wchar_t's from a target string. */
516 struct wchar_iterator
518 /* The underlying iconv descriptor. */
521 /* The input string. This is updated as we convert characters. */
523 /* The number of bytes remaining in the input. */
526 /* The width of an input character. */
529 /* The output buffer and its size. */
534 /* Create a new iterator. */
535 struct wchar_iterator *
536 make_wchar_iterator (const gdb_byte *input, size_t bytes, const char *charset,
539 struct wchar_iterator *result;
542 desc = iconv_open (INTERMEDIATE_ENCODING, charset);
543 if (desc == (iconv_t) -1)
544 perror_with_name ("Converting character sets");
546 result = XNEW (struct wchar_iterator);
548 result->input = (char *) input;
549 result->bytes = bytes;
550 result->width = width;
552 result->out = XNEW (gdb_wchar_t);
553 result->out_size = 1;
/* Cleanup callback for a wchar iterator.  P points to the
   struct wchar_iterator; its iconv descriptor is closed here.  */
559 do_cleanup_iterator (void *p)
561 struct wchar_iterator *iter = p;
563 iconv_close (iter->desc);
/* Register do_cleanup_iterator as a cleanup for ITER and return the
   value make_cleanup returns.  */
569 make_cleanup_wchar_iterator (struct wchar_iterator *iter)
571 return make_cleanup (do_cleanup_iterator, iter);
575 wchar_iterate (struct wchar_iterator *iter,
576 enum wchar_iterate_result *out_result,
577 gdb_wchar_t **out_chars,
578 const gdb_byte **ptr,
583 /* Try to convert some characters. At first we try to convert just
584 a single character. The reason for this is that iconv does not
585 necessarily update its outgoing arguments when it encounters an
586 invalid input sequence -- but we want to reliably report this to
587 our caller so it can emit an escape sequence. */
589 while (iter->bytes > 0)
591 char *outptr = (char *) &iter->out[0];
592 char *orig_inptr = iter->input;
593 size_t orig_in = iter->bytes;
594 size_t out_avail = out_request * sizeof (gdb_wchar_t);
598 size_t r = iconv (iter->desc,
599 (ICONV_CONST char **) &iter->input, &iter->bytes,
600 &outptr, &out_avail);
601 if (r == (size_t) -1)
606 /* Invalid input sequence. Skip it, and let the caller
608 *out_result = wchar_iterate_invalid;
611 iter->input += iter->width;
612 iter->bytes -= iter->width;
616 /* We ran out of space. We still might have converted a
617 character; if so, return it. Otherwise, grow the
618 buffer and try again. */
619 if (out_avail < out_request * sizeof (gdb_wchar_t))
623 if (out_request > iter->out_size)
625 iter->out_size = out_request;
626 iter->out = xrealloc (iter->out,
627 out_request * sizeof (gdb_wchar_t));
632 /* Incomplete input sequence. Let the caller know, and
633 arrange for future calls to see EOF. */
634 *out_result = wchar_iterate_incomplete;
641 perror_with_name ("Internal error while converting character sets");
645 /* We converted something. */
646 num = out_request - out_avail / sizeof (gdb_wchar_t);
647 *out_result = wchar_iterate_ok;
648 *out_chars = iter->out;
650 *len = orig_in - iter->bytes;
655 *out_result = wchar_iterate_eof;
660 /* The charset.c module initialization function. */
662 extern initialize_file_ftype _initialize_charset; /* -Wmissing-prototypes */
664 typedef char *char_ptr;
665 DEF_VEC_P (char_ptr);
667 static VEC (char_ptr) *charsets;
672 find_charset_names (void)
674 VEC_safe_push (char_ptr, charsets, GDB_DEFAULT_HOST_CHARSET);
675 VEC_safe_push (char_ptr, charsets, NULL);
678 #else /* PHONY_ICONV */
680 /* Sometimes, libiconv redefines iconvlist as libiconvlist -- but
681 provides different symbols in the static and dynamic libraries.
682 So, configure may see libiconvlist but not iconvlist. But, calling
683 iconvlist is the right thing to do and will work. Hence we do a
684 check here but unconditionally call iconvlist below. */
685 #if defined (HAVE_ICONVLIST) || defined (HAVE_LIBICONVLIST)
687 /* A helper function that adds some character sets to the vector of
688 all character sets. This is a callback function for iconvlist. */
691 add_one (unsigned int count, const char *const *names, void *data)
695 for (i = 0; i < count; ++i)
696 VEC_safe_push (char_ptr, charsets, xstrdup (names[i]));
702 find_charset_names (void)
704 iconvlist (add_one, NULL);
705 VEC_safe_push (char_ptr, charsets, NULL);
710 /* Return non-zero if LINE (output from iconv) should be ignored.
711 Older iconv programs (e.g. 2.2.2) include the human readable
712 introduction even when stdout is not a tty. Newer versions omit
713 the intro if stdout is not a tty. */
716 ignore_line_p (const char *line)
718 /* This table is used to filter the output. If this text appears
719 anywhere in the line, it is ignored (strstr is used). */
720 static const char * const ignore_lines[] =
725 "listed with several",
730 for (i = 0; ignore_lines[i] != NULL; ++i)
732 if (strstr (line, ignore_lines[i]) != NULL)
740 find_charset_names (void)
742 struct pex_obj *child;
746 struct gdb_environ *iconv_env;
748 /* Older iconvs, e.g. 2.2.2, don't omit the intro text if stdout is not
749 a tty. We need to recognize it and ignore it. This text is subject
750 to translation, so force LANGUAGE=C. */
751 iconv_env = make_environ ();
752 init_environ (iconv_env);
753 set_in_environ (iconv_env, "LANGUAGE", "C");
754 set_in_environ (iconv_env, "LC_ALL", "C");
756 child = pex_init (0, "iconv", NULL);
761 /* Note that we simply ignore errors here. */
762 if (!pex_run_in_environment (child, PEX_SEARCH | PEX_STDERR_TO_STDOUT,
763 "iconv", args, environ_vector (iconv_env),
766 FILE *in = pex_read_output (child, 0);
768 /* POSIX says that iconv -l uses an unspecified format. We
769 parse the glibc and libiconv formats; feel free to add others
774 /* The size of buf is chosen arbitrarily. */
779 r = fgets (buf, sizeof (buf), in);
785 if (ignore_line_p (r))
788 /* Strip off the newline. */
790 /* Strip off one or two '/'s. glibc will print lines like
791 "8859_7//", but also "10646-1:1993/UCS4/". */
792 if (buf[len - 1] == '/')
794 if (buf[len - 1] == '/')
798 /* libiconv will print multiple entries per line, separated
799 by spaces. Older iconvs will print multiple entries per line,
800 indented by two spaces, and separated by ", "
801 (i.e. the human readable form). */
808 /* Skip leading blanks. */
809 for (p = start; *p && *p == ' '; ++p)
812 /* Find the next space, comma, or end-of-line. */
813 for ( ; *p && *p != ' ' && *p != ','; ++p)
815 /* Ignore an empty result. */
820 VEC_safe_push (char_ptr, charsets, xstrdup (start));
823 /* Skip any extra spaces. */
824 for (start = p + 1; *start && *start == ' '; ++start)
829 if (pex_get_status (child, 1, &status)
830 && WIFEXITED (status) && !WEXITSTATUS (status))
836 free_environ (iconv_env);
840 /* Some error occurred, so drop the vector. */
843 for (ix = 0; VEC_iterate (char_ptr, charsets, ix, elt); ++ix)
845 VEC_truncate (char_ptr, charsets, 0);
848 VEC_safe_push (char_ptr, charsets, NULL);
851 #endif /* HAVE_ICONVLIST || HAVE_LIBICONVLIST */
852 #endif /* PHONY_ICONV */
855 _initialize_charset (void)
857 struct cmd_list_element *new_cmd;
859 /* The first element is always "auto"; then we skip it for the
860 commands where it is not allowed. */
861 VEC_safe_push (char_ptr, charsets, xstrdup ("auto"));
862 find_charset_names ();
864 if (VEC_length (char_ptr, charsets) > 1)
865 charset_enum = (const char **) VEC_address (char_ptr, charsets);
867 charset_enum = default_charset_names;
870 #ifdef HAVE_LANGINFO_CODESET
871 auto_host_charset_name = nl_langinfo (CODESET);
872 /* Solaris will return `646' here -- but the Solaris iconv then
873 does not accept this. Darwin (and maybe FreeBSD) may return "" here,
874 which GNU libiconv doesn't like (infinite loop). */
875 if (!strcmp (auto_host_charset_name, "646") || !*auto_host_charset_name)
876 auto_host_charset_name = "ASCII";
877 target_charset_name = auto_host_charset_name;
883 add_setshow_enum_cmd ("charset", class_support,
884 &charset_enum[1], &host_charset_name, _("\
885 Set the host and target character sets."), _("\
886 Show the host and target character sets."), _("\
887 The `host character set' is the one used by the system GDB is running on.\n\
888 The `target character set' is the one used by the program being debugged.\n\
889 You may only use supersets of ASCII for your host character set; GDB does\n\
890 not support any others.\n\
891 To see a list of the character sets GDB supports, type `set charset <TAB>'."),
892 /* Note that the sfunc below needs to set
893 target_charset_name, because the 'set
894 charset' command sets two variables. */
897 &setlist, &showlist);
899 add_setshow_enum_cmd ("host-charset", class_support,
900 charset_enum, &host_charset_name, _("\
901 Set the host character set."), _("\
902 Show the host character set."), _("\
903 The `host character set' is the one used by the system GDB is running on.\n\
904 You may only use supersets of ASCII for your host character set; GDB does\n\
905 not support any others.\n\
906 To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
907 set_host_charset_sfunc,
908 show_host_charset_name,
909 &setlist, &showlist);
911 add_setshow_enum_cmd ("target-charset", class_support,
912 &charset_enum[1], &target_charset_name, _("\
913 Set the target character set."), _("\
914 Show the target character set."), _("\
915 The `target character set' is the one used by the program being debugged.\n\
916 GDB translates characters and strings between the host and target\n\
917 character sets as needed.\n\
918 To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
919 set_target_charset_sfunc,
920 show_target_charset_name,
921 &setlist, &showlist);
923 add_setshow_enum_cmd ("target-wide-charset", class_support,
924 &charset_enum[1], &target_wide_charset_name,
926 Set the target wide character set."), _("\
927 Show the target wide character set."), _("\
928 The `target wide character set' is the one used by the program being debugged.\n\
929 In particular it is the encoding used by `wchar_t'.\n\
930 GDB translates characters and strings between the host and target\n\
931 character sets as needed.\n\
932 To see a list of the character sets GDB supports, type\n\
933 `set target-wide-charset'<TAB>"),
934 set_target_wide_charset_sfunc,
935 show_target_wide_charset_name,
936 &setlist, &showlist);