1 /* Character set conversion support for GDB.
3 Copyright (C) 2001, 2003, 2007, 2008, 2009, 2010
4 Free Software Foundation, Inc.
6 This file is part of GDB.
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
24 #include "gdb_assert.h"
25 #include "gdb_obstack.h"
27 #include "charset-list.h"
30 #include "arch-utils.h"
33 #include "gdb_string.h"
37 /* How GDB's character set support works
39 GDB has three global settings:
41 - The `current host character set' is the character set GDB should
42 use in talking to the user, and which (hopefully) the user's
43 terminal knows how to display properly. Most users should not need to change it.
46 - The `current target character set' is the character set the
47 program being debugged uses.
49 - The `current target wide character set' is the wide character set
50 the program being debugged uses, that is, the encoding used for `wchar_t'.
53 There are commands to set each of these, and mechanisms for
54 choosing reasonable default values. GDB has a global list of
55 character sets that it can use as its host or target character sets.
58 The header file `charset.h' declares various functions that
59 different pieces of GDB need to perform tasks like:
61 - printing target strings and characters to the user's terminal
62 (mostly target->host conversions),
64 - building target-appropriate representations of strings and
65 characters the user enters in expressions (mostly host->target conversions).
70 To avoid excessive code duplication and maintenance efforts,
71 GDB simply requires a capable iconv function. Users on platforms
72 without a suitable iconv can use the GNU iconv library. */
77 /* Provide a phony iconv that does as little as possible. Also,
78 arrange for there to be a single available character set. */
80 #undef GDB_DEFAULT_HOST_CHARSET
81 #define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
82 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
83 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "ISO-8859-1"
84 #undef DEFAULT_CHARSET_NAMES
85 #define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,
94 #define ICONV_CONST const
96 /* Some systems don't have EILSEQ, so we define it here, but not as
97 EINVAL, because callers of `iconv' want to distinguish EINVAL and
98 EILSEQ. This is what iconv.h from libiconv does as well. Note
99 that wchar.h may also define EILSEQ, so this needs to be after we
100 include wchar.h, which happens in defs.h through gdb_wchar.h. */
102 #define EILSEQ ENOENT
106 iconv_open (const char *to, const char *from)
108 /* We allow conversions from UTF-32BE, wchar_t, and the host charset.
109 We allow conversions to wchar_t and the host charset. */
110 if (strcmp (from, "UTF-32BE") && strcmp (from, "wchar_t")
111 && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
113 if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
116 /* Return 1 if we are converting from UTF-32BE, 0 otherwise. This is
117 used as a flag in calls to iconv. */
118 return !strcmp (from, "UTF-32BE");
122 iconv_close (iconv_t arg)
128 iconv (iconv_t utf_flag, const char **inbuf, size_t *inbytesleft,
129 char **outbuf, size_t *outbytesleft)
133 while (*inbytesleft >= 4)
138 for (j = 0; j < 4; ++j)
141 c += (*inbuf)[j] & 0xff;
156 if (*inbytesleft < 4)
164 /* In all other cases we simply copy input bytes to the output. */
166 size_t amt = *inbytesleft;
167 if (amt > *outbytesleft)
169 memcpy (*outbuf, *inbuf, amt);
173 *outbytesleft -= amt;
182 /* The number of non-reversible conversions -- but they were all
191 /* The global lists of character sets and translations. */
194 #ifndef GDB_DEFAULT_TARGET_CHARSET
195 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
198 #ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET
199 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "UTF-32"
202 static const char *auto_host_charset_name = GDB_DEFAULT_HOST_CHARSET;
203 static const char *host_charset_name = "auto";
205 show_host_charset_name (struct ui_file *file, int from_tty,
206 struct cmd_list_element *c,
209 if (!strcmp (value, "auto"))
210 fprintf_filtered (file,
211 _("The host character set is \"auto; currently %s\".\n"),
212 auto_host_charset_name);
214 fprintf_filtered (file, _("The host character set is \"%s\".\n"), value);
217 static const char *target_charset_name = "auto";
219 show_target_charset_name (struct ui_file *file, int from_tty,
220 struct cmd_list_element *c, const char *value)
222 if (!strcmp (value, "auto"))
223 fprintf_filtered (file,
224 _("The target character set is \"auto; "
225 "currently %s\".\n"),
226 gdbarch_auto_charset (get_current_arch ()));
228 fprintf_filtered (file, _("The target character set is \"%s\".\n"),
232 static const char *target_wide_charset_name = "auto";
234 show_target_wide_charset_name (struct ui_file *file, int from_tty,
235 struct cmd_list_element *c, const char *value)
237 if (!strcmp (value, "auto"))
238 fprintf_filtered (file,
239 _("The target wide character set is \"auto; "
240 "currently %s\".\n"),
241 gdbarch_auto_wide_charset (get_current_arch ()));
243 fprintf_filtered (file, _("The target wide character set is \"%s\".\n"),
247 static const char *default_charset_names[] =
249 DEFAULT_CHARSET_NAMES
253 static const char **charset_enum;
256 /* If the target wide character set has big- or little-endian
257 variants, these are the corresponding names. */
258 static const char *target_wide_charset_be_name;
259 static const char *target_wide_charset_le_name;
261 /* The architecture for which the BE- and LE-names are valid. */
262 static struct gdbarch *be_le_arch;
264 /* A helper function which sets the target wide big- and little-endian
265 character set names, if possible. */
268 set_be_le_names (struct gdbarch *gdbarch)
271 const char *target_wide;
273 if (be_le_arch == gdbarch)
275 be_le_arch = gdbarch;
277 target_wide_charset_le_name = NULL;
278 target_wide_charset_be_name = NULL;
280 target_wide = target_wide_charset_name;
281 if (!strcmp (target_wide, "auto"))
282 target_wide = gdbarch_auto_wide_charset (gdbarch);
284 len = strlen (target_wide);
285 for (i = 0; charset_enum[i]; ++i)
287 if (strncmp (target_wide, charset_enum[i], len))
289 if ((charset_enum[i][len] == 'B'
290 || charset_enum[i][len] == 'L')
291 && charset_enum[i][len + 1] == 'E'
292 && charset_enum[i][len + 2] == '\0')
294 if (charset_enum[i][len] == 'B')
295 target_wide_charset_be_name = charset_enum[i];
297 target_wide_charset_le_name = charset_enum[i];
302 /* Validation helper shared by the 'set charset', 'set host-charset',
303 'set target-charset', and 'set target-wide-charset' sfunc's. */
306 validate (struct gdbarch *gdbarch)
309 const char *host_cset = host_charset ();
310 const char *target_cset = target_charset (gdbarch);
311 const char *target_wide_cset = target_wide_charset_name;
312 if (!strcmp (target_wide_cset, "auto"))
313 target_wide_cset = gdbarch_auto_wide_charset (gdbarch);
315 desc = iconv_open (target_wide_cset, host_cset);
316 if (desc == (iconv_t) -1)
317 error ("Cannot convert between character sets `%s' and `%s'",
318 target_wide_cset, host_cset);
321 desc = iconv_open (target_cset, host_cset);
322 if (desc == (iconv_t) -1)
323 error ("Cannot convert between character sets `%s' and `%s'",
324 target_cset, host_cset);
327 /* Clear the cache. */
331 /* This is the sfunc for the 'set charset' command. */
333 set_charset_sfunc (char *charset, int from_tty, struct cmd_list_element *c)
335 /* CAREFUL: set the target charset here as well. */
336 target_charset_name = host_charset_name;
337 validate (get_current_arch ());
340 /* 'set host-charset' command sfunc. We need a wrapper here because
341 the function needs to have a specific signature. */
343 set_host_charset_sfunc (char *charset, int from_tty,
344 struct cmd_list_element *c)
346 validate (get_current_arch ());
349 /* Wrapper for the 'set target-charset' command. */
351 set_target_charset_sfunc (char *charset, int from_tty,
352 struct cmd_list_element *c)
354 validate (get_current_arch ());
357 /* Wrapper for the 'set target-wide-charset' command. */
359 set_target_wide_charset_sfunc (char *charset, int from_tty,
360 struct cmd_list_element *c)
362 validate (get_current_arch ());
365 /* sfunc for the 'show charset' command. */
367 show_charset (struct ui_file *file, int from_tty, struct cmd_list_element *c,
370 show_host_charset_name (file, from_tty, c, host_charset_name);
371 show_target_charset_name (file, from_tty, c, target_charset_name);
372 show_target_wide_charset_name (file, from_tty, c, target_wide_charset_name);
376 /* Accessor functions. */
381 if (!strcmp (host_charset_name, "auto"))
382 return auto_host_charset_name;
383 return host_charset_name;
387 target_charset (struct gdbarch *gdbarch)
389 if (!strcmp (target_charset_name, "auto"))
390 return gdbarch_auto_charset (gdbarch);
391 return target_charset_name;
395 target_wide_charset (struct gdbarch *gdbarch)
397 enum bfd_endian byte_order = gdbarch_byte_order (gdbarch);
399 set_be_le_names (gdbarch);
400 if (byte_order == BFD_ENDIAN_BIG)
402 if (target_wide_charset_be_name)
403 return target_wide_charset_be_name;
407 if (target_wide_charset_le_name)
408 return target_wide_charset_le_name;
411 if (!strcmp (target_wide_charset_name, "auto"))
412 return gdbarch_auto_wide_charset (gdbarch);
414 return target_wide_charset_name;
418 /* Host character set management. For the time being, we assume that
419 the host character set is some superset of ASCII. */
422 host_letter_to_control_character (char c)
429 /* Convert a host character, C, to its hex value. C must already have
430 been validated using isxdigit. */
433 host_hex_value (char c)
437 if (c >= 'a' && c <= 'f')
439 gdb_assert (c >= 'A' && c <= 'F');
444 /* Public character management functions. */
446 /* A cleanup function which is run to close an iconv descriptor. */
449 cleanup_iconv (void *p)
452 iconv_close (*descp);
456 convert_between_encodings (const char *from, const char *to,
457 const gdb_byte *bytes, unsigned int num_bytes,
458 int width, struct obstack *output,
459 enum transliterations translit)
462 struct cleanup *cleanups;
465 unsigned int space_request;
467 /* Often, the host and target charsets will be the same. */
468 if (!strcmp (from, to))
470 obstack_grow (output, bytes, num_bytes);
474 desc = iconv_open (to, from);
475 if (desc == (iconv_t) -1)
476 perror_with_name ("Converting character sets");
477 cleanups = make_cleanup (cleanup_iconv, &desc);
480 inp = (char *) bytes;
482 space_request = num_bytes;
490 old_size = obstack_object_size (output);
491 obstack_blank (output, space_request);
493 outp = obstack_base (output) + old_size;
494 outleft = space_request;
496 r = iconv (desc, (ICONV_CONST char **) &inp, &inleft, &outp, &outleft);
498 /* Now make sure that the object on the obstack only includes
499 bytes we have converted. */
500 obstack_blank (output, - (int) outleft);
502 if (r == (size_t) -1)
510 /* Invalid input sequence. */
511 if (translit == translit_none)
512 error (_("Could not convert character to `%s' character set"),
515 /* We emit an escape sequence for each of the bytes and skip them. */
517 for (i = 0; i < width; ++i)
521 sprintf (octal, "\\%.3o", *inp & 0xff);
522 obstack_grow_str (output, octal);
531 /* We ran out of space in the output buffer. Make it
532 bigger next time around. */
537 /* Incomplete input sequence. FIXME: ought to report this
538 to the caller somehow. */
543 perror_with_name ("Internal error while converting character sets");
548 do_cleanups (cleanups);
553 /* An iterator that returns host wchar_t's from a target string. */
554 struct wchar_iterator
556 /* The underlying iconv descriptor. */
559 /* The input string. This is updated as we convert characters. */
561 /* The number of bytes remaining in the input. */
564 /* The width of an input character. */
567 /* The output buffer and its size. */
572 /* Create a new iterator. */
573 struct wchar_iterator *
574 make_wchar_iterator (const gdb_byte *input, size_t bytes, const char *charset,
577 struct wchar_iterator *result;
580 desc = iconv_open (INTERMEDIATE_ENCODING, charset);
581 if (desc == (iconv_t) -1)
582 perror_with_name ("Converting character sets");
584 result = XNEW (struct wchar_iterator);
586 result->input = (char *) input;
587 result->bytes = bytes;
588 result->width = width;
590 result->out = XNEW (gdb_wchar_t);
591 result->out_size = 1;
597 do_cleanup_iterator (void *p)
599 struct wchar_iterator *iter = p;
601 iconv_close (iter->desc);
607 make_cleanup_wchar_iterator (struct wchar_iterator *iter)
609 return make_cleanup (do_cleanup_iterator, iter);
613 wchar_iterate (struct wchar_iterator *iter,
614 enum wchar_iterate_result *out_result,
615 gdb_wchar_t **out_chars,
616 const gdb_byte **ptr,
621 /* Try to convert some characters. At first we try to convert just
622 a single character. The reason for this is that iconv does not
623 necessarily update its outgoing arguments when it encounters an
624 invalid input sequence -- but we want to reliably report this to
625 our caller so it can emit an escape sequence. */
627 while (iter->bytes > 0)
629 char *outptr = (char *) &iter->out[0];
630 char *orig_inptr = iter->input;
631 size_t orig_in = iter->bytes;
632 size_t out_avail = out_request * sizeof (gdb_wchar_t);
636 size_t r = iconv (iter->desc,
637 (ICONV_CONST char **) &iter->input, &iter->bytes,
638 &outptr, &out_avail);
639 if (r == (size_t) -1)
644 /* Invalid input sequence. Skip it, and let the caller know about it. */
646 *out_result = wchar_iterate_invalid;
649 iter->input += iter->width;
650 iter->bytes -= iter->width;
654 /* We ran out of space. We still might have converted a
655 character; if so, return it. Otherwise, grow the
656 buffer and try again. */
657 if (out_avail < out_request * sizeof (gdb_wchar_t))
661 if (out_request > iter->out_size)
663 iter->out_size = out_request;
664 iter->out = xrealloc (iter->out,
665 out_request * sizeof (gdb_wchar_t));
670 /* Incomplete input sequence. Let the caller know, and
671 arrange for future calls to see EOF. */
672 *out_result = wchar_iterate_incomplete;
679 perror_with_name ("Internal error while converting character sets");
683 /* We converted something. */
684 num = out_request - out_avail / sizeof (gdb_wchar_t);
685 *out_result = wchar_iterate_ok;
686 *out_chars = iter->out;
688 *len = orig_in - iter->bytes;
693 *out_result = wchar_iterate_eof;
698 /* The charset.c module initialization function. */
700 extern initialize_file_ftype _initialize_charset; /* Prototype to silence -Wmissing-prototypes. */
702 typedef char *char_ptr;
703 DEF_VEC_P (char_ptr);
705 static VEC (char_ptr) *charsets;
710 find_charset_names (void)
712 VEC_safe_push (char_ptr, charsets, GDB_DEFAULT_HOST_CHARSET);
713 VEC_safe_push (char_ptr, charsets, NULL);
716 #else /* PHONY_ICONV */
718 /* Sometimes, libiconv redefines iconvlist as libiconvlist -- but
719 provides different symbols in the static and dynamic libraries.
720 So, configure may see libiconvlist but not iconvlist. But, calling
721 iconvlist is the right thing to do and will work. Hence we do a
722 check here but unconditionally call iconvlist below. */
723 #if defined (HAVE_ICONVLIST) || defined (HAVE_LIBICONVLIST)
725 /* A helper function that adds some character sets to the vector of
726 all character sets. This is a callback function for iconvlist. */
729 add_one (unsigned int count, const char *const *names, void *data)
733 for (i = 0; i < count; ++i)
734 VEC_safe_push (char_ptr, charsets, xstrdup (names[i]));
740 find_charset_names (void)
742 iconvlist (add_one, NULL);
743 VEC_safe_push (char_ptr, charsets, NULL);
748 /* Return non-zero if LINE (output from iconv) should be ignored.
749 Older iconv programs (e.g. 2.2.2) include the human readable
750 introduction even when stdout is not a tty. Newer versions omit
751 the intro if stdout is not a tty. */
754 ignore_line_p (const char *line)
756 /* This table is used to filter the output. If this text appears
757 anywhere in the line, it is ignored (strstr is used). */
758 static const char * const ignore_lines[] =
763 "listed with several",
768 for (i = 0; ignore_lines[i] != NULL; ++i)
770 if (strstr (line, ignore_lines[i]) != NULL)
778 find_charset_names (void)
780 struct pex_obj *child;
784 struct gdb_environ *iconv_env;
786 /* Older iconvs, e.g. 2.2.2, don't omit the intro text if stdout is not
787 a tty. We need to recognize it and ignore it. This text is subject
788 to translation, so force LANGUAGE=C. */
789 iconv_env = make_environ ();
790 init_environ (iconv_env);
791 set_in_environ (iconv_env, "LANGUAGE", "C");
792 set_in_environ (iconv_env, "LC_ALL", "C");
794 child = pex_init (0, "iconv", NULL);
799 /* Note that we simply ignore errors here. */
800 if (!pex_run_in_environment (child, PEX_SEARCH | PEX_STDERR_TO_STDOUT,
801 "iconv", args, environ_vector (iconv_env),
804 FILE *in = pex_read_output (child, 0);
806 /* POSIX says that iconv -l uses an unspecified format. We
807 parse the glibc and libiconv formats; feel free to add others as needed. */
812 /* The size of buf is chosen arbitrarily. */
817 r = fgets (buf, sizeof (buf), in);
823 if (ignore_line_p (r))
826 /* Strip off the newline. */
828 /* Strip off one or two '/'s. glibc will print lines like
829 "8859_7//", but also "10646-1:1993/UCS4/". */
830 if (buf[len - 1] == '/')
832 if (buf[len - 1] == '/')
836 /* libiconv will print multiple entries per line, separated
837 by spaces. Older iconvs will print multiple entries per line,
838 indented by two spaces, and separated by ", "
839 (i.e. the human readable form). */
846 /* Skip leading blanks. */
847 for (p = start; *p && *p == ' '; ++p)
850 /* Find the next space, comma, or end-of-line. */
851 for ( ; *p && *p != ' ' && *p != ','; ++p)
853 /* Ignore an empty result. */
858 VEC_safe_push (char_ptr, charsets, xstrdup (start));
861 /* Skip any extra spaces. */
862 for (start = p + 1; *start && *start == ' '; ++start)
867 if (pex_get_status (child, 1, &status)
868 && WIFEXITED (status) && !WEXITSTATUS (status))
874 free_environ (iconv_env);
878 /* Some error occurred, so drop the vector. */
881 for (ix = 0; VEC_iterate (char_ptr, charsets, ix, elt); ++ix)
883 VEC_truncate (char_ptr, charsets, 0);
886 VEC_safe_push (char_ptr, charsets, NULL);
889 #endif /* HAVE_ICONVLIST || HAVE_LIBICONVLIST */
890 #endif /* PHONY_ICONV */
892 /* The "auto" target charset used by default_auto_charset. */
893 static const char *auto_target_charset_name = GDB_DEFAULT_TARGET_CHARSET;
896 default_auto_charset (void)
898 return auto_target_charset_name;
902 default_auto_wide_charset (void)
904 return GDB_DEFAULT_TARGET_WIDE_CHARSET;
908 _initialize_charset (void)
910 struct cmd_list_element *new_cmd;
912 /* The first element is always "auto". */
913 VEC_safe_push (char_ptr, charsets, xstrdup ("auto"));
914 find_charset_names ();
916 if (VEC_length (char_ptr, charsets) > 1)
917 charset_enum = (const char **) VEC_address (char_ptr, charsets);
919 charset_enum = default_charset_names;
922 #ifdef HAVE_LANGINFO_CODESET
923 /* The result of nl_langinfo may be overwritten later. This may
924 leak a little memory, if the user later changes the host charset,
925 but that doesn't matter much. */
926 auto_host_charset_name = xstrdup (nl_langinfo (CODESET));
927 /* Solaris will return `646' here -- but the Solaris iconv then
928 does not accept this. Darwin (and maybe FreeBSD) may return "" here,
929 which GNU libiconv doesn't like (infinite loop). */
930 if (!strcmp (auto_host_charset_name, "646") || !*auto_host_charset_name)
931 auto_host_charset_name = "ASCII";
932 auto_target_charset_name = auto_host_charset_name;
933 #elif defined (USE_WIN32API)
/* Buffer that receives the Windows code-page charset name; it is static
   because auto_host_charset_name is set to point at it.  Sized for
   "CP" + x<=5 digits + paranoia.  The element type must be spelled out:
   leaving it off relies on implicit int, which is invalid since C99 and
   is the wrong type for an snprintf destination.  */
static char w32_host_default_charset[16];
937 snprintf (w32_host_default_charset, sizeof w32_host_default_charset,
939 auto_host_charset_name = w32_host_default_charset;
940 auto_target_charset_name = auto_host_charset_name;
945 add_setshow_enum_cmd ("charset", class_support,
946 charset_enum, &host_charset_name, _("\
947 Set the host and target character sets."), _("\
948 Show the host and target character sets."), _("\
949 The `host character set' is the one used by the system GDB is running on.\n\
950 The `target character set' is the one used by the program being debugged.\n\
951 You may only use supersets of ASCII for your host character set; GDB does\n\
952 not support any others.\n\
953 To see a list of the character sets GDB supports, type `set charset <TAB>'."),
954 /* Note that the sfunc below needs to set
955 target_charset_name, because the 'set
956 charset' command sets two variables. */
959 &setlist, &showlist);
961 add_setshow_enum_cmd ("host-charset", class_support,
962 charset_enum, &host_charset_name, _("\
963 Set the host character set."), _("\
964 Show the host character set."), _("\
965 The `host character set' is the one used by the system GDB is running on.\n\
966 You may only use supersets of ASCII for your host character set; GDB does\n\
967 not support any others.\n\
968 To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
969 set_host_charset_sfunc,
970 show_host_charset_name,
971 &setlist, &showlist);
973 add_setshow_enum_cmd ("target-charset", class_support,
974 charset_enum, &target_charset_name, _("\
975 Set the target character set."), _("\
976 Show the target character set."), _("\
977 The `target character set' is the one used by the program being debugged.\n\
978 GDB translates characters and strings between the host and target\n\
979 character sets as needed.\n\
980 To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
981 set_target_charset_sfunc,
982 show_target_charset_name,
983 &setlist, &showlist);
985 add_setshow_enum_cmd ("target-wide-charset", class_support,
986 charset_enum, &target_wide_charset_name,
988 Set the target wide character set."), _("\
989 Show the target wide character set."), _("\
990 The `target wide character set' is the one used by the program being debugged.\n\
991 In particular it is the encoding used by `wchar_t'.\n\
992 GDB translates characters and strings between the host and target\n\
993 character sets as needed.\n\
994 To see a list of the character sets GDB supports, type\n\
995 `set target-wide-charset'<TAB>"),
996 set_target_wide_charset_sfunc,
997 show_target_wide_charset_name,
998 &setlist, &showlist);