1 /* Character set conversion support for GDB.
3 Copyright (C) 2001, 2003, 2007, 2008, 2009 Free Software Foundation, Inc.
5 This file is part of GDB.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #include "gdb_assert.h"
24 #include "gdb_obstack.h"
26 #include "charset-list.h"
30 #include "gdb_string.h"
34 /* How GDB's character set support works
36 GDB has three global settings:
38 - The `current host character set' is the character set GDB should
39 use in talking to the user, and which (hopefully) the user's
40 terminal knows how to display properly. Most users should not
43 - The `current target character set' is the character set the
44 program being debugged uses.
46 - The `current target wide character set' is the wide character set
47 the program being debugged uses, that is, the encoding used for
50 There are commands to set each of these, and mechanisms for
51 choosing reasonable default values. GDB has a global list of
52 character sets that it can use as its host or target character
55 The header file `charset.h' declares various functions that
56 different pieces of GDB need to perform tasks like:
58 - printing target strings and characters to the user's terminal
59 (mostly target->host conversions),
61 - building target-appropriate representations of strings and
62 characters the user enters in expressions (mostly host->target
67 To avoid excessive code duplication and maintenance efforts,
68 GDB simply requires a capable iconv function. Users on platforms
69 without a suitable iconv can use the GNU iconv library. */
74 /* Provide a phony iconv that does as little as possible. Also,
75 arrange for there to be a single available character set. */
77 #undef GDB_DEFAULT_HOST_CHARSET
78 #define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
79 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
80 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "ISO-8859-1"
81 #undef DEFAULT_CHARSET_NAMES
82 #define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,
91 #define ICONV_CONST const
93 /* Some systems don't have EILSEQ, so we define it here, but not as
94 EINVAL, because callers of `iconv' want to distinguish EINVAL and
95 EILSEQ. This is what iconv.h from libiconv does as well. Note
96 that wchar.h may also define EILSEQ, so this needs to be after we
97 include wchar.h, which happens in defs.h through gdb_wchar.h. */
103 iconv_open (const char *to, const char *from)
105 /* We allow conversions from UCS-4BE, wchar_t, and the host charset.
106 We allow conversions to wchar_t and the host charset. */
107 if (strcmp (from, "UCS-4BE") && strcmp (from, "wchar_t")
108 && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
110 if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
113 /* Return 1 if we are converting from UCS-4BE, 0 otherwise. This is
114 used as a flag in calls to iconv. */
115 return !strcmp (from, "UCS-4BE");
119 iconv_close (iconv_t arg)
125 iconv (iconv_t ucs_flag, const char **inbuf, size_t *inbytesleft,
126 char **outbuf, size_t *outbytesleft)
130 while (*inbytesleft >= 4)
135 for (j = 0; j < 4; ++j)
138 c += (*inbuf)[j] & 0xff;
153 if (*inbytesleft < 4)
161 /* In all other cases we simply copy input bytes to the
163 size_t amt = *inbytesleft;
164 if (amt > *outbytesleft)
166 memcpy (*outbuf, *inbuf, amt);
170 *outbytesleft -= amt;
179 /* The number of non-reversible conversions -- but they were all
188 /* The global lists of character sets and translations. */
191 #ifndef GDB_DEFAULT_TARGET_CHARSET
192 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
195 #ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET
196 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "UCS-4"
199 static const char *auto_host_charset_name = GDB_DEFAULT_HOST_CHARSET;
200 static const char *host_charset_name = "auto";
/* Report the host character set VALUE on FILE.  When VALUE is the
   string "auto", the message also names the character set currently
   held in auto_host_charset_name.  This function is registered as the
   show function of the `host-charset' setting and is also called by
   show_charset.  */
202 show_host_charset_name (struct ui_file *file, int from_tty,
203 struct cmd_list_element *c,
206 if (!strcmp (value, "auto"))
207 fprintf_filtered (file,
208 _("The host character set is \"auto; currently %s\".\n"),
209 auto_host_charset_name);
211 fprintf_filtered (file, _("The host character set is \"%s\".\n"), value);
214 static const char *target_charset_name = GDB_DEFAULT_TARGET_CHARSET;
/* Report the target character set on FILE.  This function is
   registered as the show function of the `target-charset' setting and
   is also called by show_charset.  */
216 show_target_charset_name (struct ui_file *file, int from_tty,
217 struct cmd_list_element *c, const char *value)
219 fprintf_filtered (file, _("The target character set is \"%s\".\n"),
223 static const char *target_wide_charset_name = GDB_DEFAULT_TARGET_WIDE_CHARSET;
/* Report the target wide character set on FILE.  This function is
   registered as the show function of the `target-wide-charset'
   setting and is also called by show_charset.  */
225 show_target_wide_charset_name (struct ui_file *file, int from_tty,
226 struct cmd_list_element *c, const char *value)
228 fprintf_filtered (file, _("The target wide character set is \"%s\".\n"),
232 static const char *default_charset_names[] =
234 DEFAULT_CHARSET_NAMES
238 static const char **charset_enum;
241 /* If the target wide character set has big- or little-endian
242 variants, these are the corresponding names. */
243 static const char *target_wide_charset_be_name;
244 static const char *target_wide_charset_le_name;
246 /* A helper function for validate which sets the target wide big- and
247 little-endian character set names, if possible. */
250 set_be_le_names (void)
254 target_wide_charset_le_name = NULL;
255 target_wide_charset_be_name = NULL;
257 len = strlen (target_wide_charset_name);
258 for (i = 0; charset_enum[i]; ++i)
260 if (strncmp (target_wide_charset_name, charset_enum[i], len))
262 if ((charset_enum[i][len] == 'B'
263 || charset_enum[i][len] == 'L')
264 && charset_enum[i][len + 1] == 'E'
265 && charset_enum[i][len + 2] == '\0')
267 if (charset_enum[i][len] == 'B')
268 target_wide_charset_be_name = charset_enum[i];
270 target_wide_charset_le_name = charset_enum[i];
275 /* 'set charset', 'set host-charset', 'set target-charset' and
276 'set target-wide-charset' sfunc's. */
282 const char *host_cset = host_charset ();
284 desc = iconv_open (target_wide_charset_name, host_cset);
285 if (desc == (iconv_t) -1)
286 error ("Cannot convert between character sets `%s' and `%s'",
287 target_wide_charset_name, host_cset);
290 desc = iconv_open (target_charset_name, host_cset);
291 if (desc == (iconv_t) -1)
292 error ("Cannot convert between character sets `%s' and `%s'",
293 target_charset_name, host_cset);
299 /* This is the sfunc for the 'set charset' command. */
301 set_charset_sfunc (char *charset, int from_tty, struct cmd_list_element *c)
303 /* CAREFUL: set the target charset here as well. */
304 target_charset_name = host_charset_name;
308 /* 'set host-charset' command sfunc. We need a wrapper here because
309 the function needs to have a specific signature. */
311 set_host_charset_sfunc (char *charset, int from_tty,
312 struct cmd_list_element *c)
317 /* Wrapper for the 'set target-charset' command. */
319 set_target_charset_sfunc (char *charset, int from_tty,
320 struct cmd_list_element *c)
325 /* Wrapper for the 'set target-wide-charset' command. */
327 set_target_wide_charset_sfunc (char *charset, int from_tty,
328 struct cmd_list_element *c)
333 /* sfunc for the 'show charset' command. */
335 show_charset (struct ui_file *file, int from_tty, struct cmd_list_element *c,
338 show_host_charset_name (file, from_tty, c, host_charset_name);
339 show_target_charset_name (file, from_tty, c, target_charset_name);
340 show_target_wide_charset_name (file, from_tty, c, target_wide_charset_name);
344 /* Accessor functions. */
349 if (!strcmp (host_charset_name, "auto"))
350 return auto_host_charset_name;
351 return host_charset_name;
/* Return the name of the current target character set, i.e. the
   value of target_charset_name.  */
355 target_charset (void)
357 return target_charset_name;
/* Return the name of the target wide character set to use for
   BYTE_ORDER.  For BFD_ENDIAN_BIG the big-endian variant name
   target_wide_charset_be_name is returned when it is non-NULL; the
   little-endian variant name target_wide_charset_le_name is
   presumably consulted for the other byte orders (TODO confirm).
   When no endian-specific name is set, target_wide_charset_name is
   returned.  */
361 target_wide_charset (enum bfd_endian byte_order)
363 if (byte_order == BFD_ENDIAN_BIG)
365 if (target_wide_charset_be_name)
366 return target_wide_charset_be_name;
370 if (target_wide_charset_le_name)
371 return target_wide_charset_le_name;
374 return target_wide_charset_name;
378 /* Host character set management. For the time being, we assume that
379 the host character set is some superset of ASCII. */
382 host_letter_to_control_character (char c)
389 /* Convert a host character, C, to its hex value. C must already have
390 been validated using isxdigit. */
393 host_hex_value (char c)
397 if (c >= 'a' && c <= 'f')
399 gdb_assert (c >= 'A' && c <= 'F');
404 /* Public character management functions. */
406 /* A cleanup function which is run to close an iconv descriptor. */
409 cleanup_iconv (void *p)
412 iconv_close (*descp);
/* Convert the NUM_BYTES bytes at BYTES from the encoding named FROM
   to the encoding named TO, appending the result to the obstack
   OUTPUT.

   If FROM and TO are the same string, the bytes are appended to
   OUTPUT unchanged.  Otherwise an iconv descriptor is opened for the
   conversion; failure to open it is reported with perror_with_name.
   A cleanup is registered to close the descriptor, and the cleanups
   are run before the function finishes.

   Space is requested on OUTPUT before each call to iconv and the
   unused part is given back afterwards, so the object on the obstack
   only contains converted bytes.

   When iconv reports an invalid input sequence and TRANSLIT is
   translit_none, error is called.  Otherwise WIDTH input bytes are
   appended to OUTPUT as backslash-octal escapes.  Unexpected iconv
   failures are reported with perror_with_name.  */
416 convert_between_encodings (const char *from, const char *to,
417 const gdb_byte *bytes, unsigned int num_bytes,
418 int width, struct obstack *output,
419 enum transliterations translit)
422 struct cleanup *cleanups;
425 unsigned int space_request;
427 /* Often, the host and target charsets will be the same. */
428 if (!strcmp (from, to))
430 obstack_grow (output, bytes, num_bytes);
434 desc = iconv_open (to, from);
435 if (desc == (iconv_t) -1)
436 perror_with_name ("Converting character sets");
437 cleanups = make_cleanup (cleanup_iconv, &desc);
440 inp = (char *) bytes;
442 space_request = num_bytes;
450 old_size = obstack_object_size (output);
451 obstack_blank (output, space_request);
453 outp = obstack_base (output) + old_size;
454 outleft = space_request;
456 r = iconv (desc, (ICONV_CONST char **) &inp, &inleft, &outp, &outleft);
458 /* Now make sure that the object on the obstack only includes
459 bytes we have converted. */
460 obstack_blank (output, - (int) outleft);
462 if (r == (size_t) -1)
470 /* Invalid input sequence. */
471 if (translit == translit_none)
472 error (_("Could not convert character to `%s' character set"),
475 /* We emit escape sequences for the bytes, skip them,
477 for (i = 0; i < width; ++i)
481 sprintf (octal, "\\%.3o", *inp & 0xff);
482 obstack_grow_str (output, octal);
491 /* We ran out of space in the output buffer. Make it
492 bigger next time around. */
497 /* Incomplete input sequence. FIXME: ought to report this
498 to the caller somehow. */
503 perror_with_name ("Internal error while converting character sets");
508 do_cleanups (cleanups);
513 /* An iterator that returns host wchar_t's from a target string. */
514 struct wchar_iterator
516 /* The underlying iconv descriptor. */
519 /* The input string. This is updated as we convert characters. */
521 /* The number of bytes remaining in the input. */
524 /* The width of an input character. */
527 /* The output buffer and its size. */
532 /* Create a new iterator over the BYTES bytes at INPUT, which are
   encoded in CHARSET.  An iconv descriptor converting from CHARSET
   to INTERMEDIATE_ENCODING is opened; perror_with_name is called if
   that fails.  WIDTH is stored as the width of an input character.
   The iterator starts out with an output buffer that holds a single
   gdb_wchar_t.  */
533 struct wchar_iterator *
534 make_wchar_iterator (const gdb_byte *input, size_t bytes, const char *charset,
537 struct wchar_iterator *result;
540 desc = iconv_open (INTERMEDIATE_ENCODING, charset);
541 if (desc == (iconv_t) -1)
542 perror_with_name ("Converting character sets");
544 result = XNEW (struct wchar_iterator);
546 result->input = (char *) input;
547 result->bytes = bytes;
548 result->width = width;
550 result->out = XNEW (gdb_wchar_t);
551 result->out_size = 1;
/* Cleanup callback for a wchar iterator.  P points to the struct
   wchar_iterator whose iconv descriptor is closed here.  */
557 do_cleanup_iterator (void *p)
559 struct wchar_iterator *iter = p;
561 iconv_close (iter->desc);
/* Register do_cleanup_iterator as a cleanup for ITER and return the
   value returned by make_cleanup.  */
567 make_cleanup_wchar_iterator (struct wchar_iterator *iter)
569 return make_cleanup (do_cleanup_iterator, iter);
/* Convert the next part of ITER's input to gdb_wchar_t values.

   *OUT_RESULT is set to one of the following:

   - wchar_iterate_ok: something was converted.  *OUT_CHARS points to
     ITER's output buffer and *LEN is the number of input bytes that
     were consumed.

   - wchar_iterate_invalid: the input sequence was invalid.  ITER is
     advanced past ITER->width bytes of input.

   - wchar_iterate_incomplete: the input ends with an incomplete
     sequence.

   - wchar_iterate_eof: no input bytes remain.

   If iconv runs out of output space without having converted
   anything, ITER's output buffer is grown with xrealloc and the
   conversion is retried.  Any other iconv failure is reported with
   perror_with_name.  */
573 wchar_iterate (struct wchar_iterator *iter,
574 enum wchar_iterate_result *out_result,
575 gdb_wchar_t **out_chars,
576 const gdb_byte **ptr,
581 /* Try to convert some characters. At first we try to convert just
582 a single character. The reason for this is that iconv does not
583 necessarily update its outgoing arguments when it encounters an
584 invalid input sequence -- but we want to reliably report this to
585 our caller so it can emit an escape sequence. */
587 while (iter->bytes > 0)
589 char *outptr = (char *) &iter->out[0];
590 char *orig_inptr = iter->input;
591 size_t orig_in = iter->bytes;
592 size_t out_avail = out_request * sizeof (gdb_wchar_t);
596 size_t r = iconv (iter->desc,
597 (ICONV_CONST char **) &iter->input, &iter->bytes,
598 &outptr, &out_avail);
599 if (r == (size_t) -1)
604 /* Invalid input sequence. Skip it, and let the caller
606 *out_result = wchar_iterate_invalid;
609 iter->input += iter->width;
610 iter->bytes -= iter->width;
614 /* We ran out of space. We still might have converted a
615 character; if so, return it. Otherwise, grow the
616 buffer and try again. */
617 if (out_avail < out_request * sizeof (gdb_wchar_t))
621 if (out_request > iter->out_size)
623 iter->out_size = out_request;
624 iter->out = xrealloc (iter->out,
625 out_request * sizeof (gdb_wchar_t));
630 /* Incomplete input sequence. Let the caller know, and
631 arrange for future calls to see EOF. */
632 *out_result = wchar_iterate_incomplete;
639 perror_with_name ("Internal error while converting character sets");
643 /* We converted something. */
644 num = out_request - out_avail / sizeof (gdb_wchar_t);
645 *out_result = wchar_iterate_ok;
646 *out_chars = iter->out;
648 *len = orig_in - iter->bytes;
653 *out_result = wchar_iterate_eof;
658 /* The charset.c module initialization function. */
660 extern initialize_file_ftype _initialize_charset; /* -Wmissing-prototypes */
662 typedef char *char_ptr;
663 DEF_VEC_P (char_ptr);
665 static VEC (char_ptr) *charsets;
670 find_charset_names (void)
672 VEC_safe_push (char_ptr, charsets, GDB_DEFAULT_HOST_CHARSET);
673 VEC_safe_push (char_ptr, charsets, NULL);
676 #else /* PHONY_ICONV */
678 /* Sometimes, libiconv redefines iconvlist as libiconvlist -- but
679 provides different symbols in the static and dynamic libraries.
680 So, configure may see libiconvlist but not iconvlist. But, calling
681 iconvlist is the right thing to do and will work. Hence we do a
682 check here but unconditionally call iconvlist below. */
683 #if defined (HAVE_ICONVLIST) || defined (HAVE_LIBICONVLIST)
685 /* A helper function that adds some character sets to the vector of
686 all character sets. This is a callback function for iconvlist. */
689 add_one (unsigned int count, const char *const *names, void *data)
693 for (i = 0; i < count; ++i)
694 VEC_safe_push (char_ptr, charsets, xstrdup (names[i]));
700 find_charset_names (void)
702 iconvlist (add_one, NULL);
703 VEC_safe_push (char_ptr, charsets, NULL);
709 find_charset_names (void)
711 struct pex_obj *child;
716 child = pex_init (0, "iconv", NULL);
721 /* Note that we simply ignore errors here. */
722 if (!pex_run (child, PEX_SEARCH | PEX_STDERR_TO_STDOUT, "iconv",
723 args, NULL, NULL, &err))
725 FILE *in = pex_read_output (child, 0);
727 /* POSIX says that iconv -l uses an unspecified format. We
728 parse the glibc and libiconv formats; feel free to add others
732 /* The size of buf is chosen arbitrarily. */
737 r = fgets (buf, sizeof (buf), in);
743 /* Strip off the newline. */
745 /* Strip off one or two '/'s. glibc will print lines like
746 "8859_7//", but also "10646-1:1993/UCS4/". */
747 if (buf[len - 1] == '/')
749 if (buf[len - 1] == '/')
753 /* libiconv will print multiple entries per line, separated
761 /* Find the next space, or end-of-line. */
762 for (p = start; *p && *p != ' '; ++p)
764 /* Ignore an empty result. */
769 VEC_safe_push (char_ptr, charsets, xstrdup (start));
772 /* Skip any extra spaces. */
773 for (start = p + 1; *start && *start == ' '; ++start)
778 if (pex_get_status (child, 1, &status)
779 && WIFEXITED (status) && !WEXITSTATUS (status))
788 /* Some error occurred, so drop the vector. */
791 for (ix = 0; VEC_iterate (char_ptr, charsets, ix, elt); ++ix)
793 VEC_truncate (char_ptr, charsets, 0);
796 VEC_safe_push (char_ptr, charsets, NULL);
799 #endif /* HAVE_ICONVLIST || HAVE_LIBICONVLIST */
800 #endif /* PHONY_ICONV */
803 _initialize_charset (void)
805 struct cmd_list_element *new_cmd;
807 /* The first element is always "auto"; then we skip it for the
808 commands where it is not allowed. */
809 VEC_safe_push (char_ptr, charsets, xstrdup ("auto"));
810 find_charset_names ();
812 if (VEC_length (char_ptr, charsets) > 1)
813 charset_enum = (const char **) VEC_address (char_ptr, charsets);
815 charset_enum = default_charset_names;
818 #ifdef HAVE_LANGINFO_CODESET
819 auto_host_charset_name = nl_langinfo (CODESET);
820 /* Solaris will return `646' here -- but the Solaris iconv then
821 does not accept this. */
822 if (!strcmp (auto_host_charset_name, "646"))
823 auto_host_charset_name = "ASCII";
824 target_charset_name = auto_host_charset_name;
830 add_setshow_enum_cmd ("charset", class_support,
831 &charset_enum[1], &host_charset_name, _("\
832 Set the host and target character sets."), _("\
833 Show the host and target character sets."), _("\
834 The `host character set' is the one used by the system GDB is running on.\n\
835 The `target character set' is the one used by the program being debugged.\n\
836 You may only use supersets of ASCII for your host character set; GDB does\n\
837 not support any others.\n\
838 To see a list of the character sets GDB supports, type `set charset <TAB>'."),
839 /* Note that the sfunc below needs to set
840 target_charset_name, because the 'set
841 charset' command sets two variables. */
844 &setlist, &showlist);
846 add_setshow_enum_cmd ("host-charset", class_support,
847 charset_enum, &host_charset_name, _("\
848 Set the host character set."), _("\
849 Show the host character set."), _("\
850 The `host character set' is the one used by the system GDB is running on.\n\
851 You may only use supersets of ASCII for your host character set; GDB does\n\
852 not support any others.\n\
853 To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
854 set_host_charset_sfunc,
855 show_host_charset_name,
856 &setlist, &showlist);
858 add_setshow_enum_cmd ("target-charset", class_support,
859 &charset_enum[1], &target_charset_name, _("\
860 Set the target character set."), _("\
861 Show the target character set."), _("\
862 The `target character set' is the one used by the program being debugged.\n\
863 GDB translates characters and strings between the host and target\n\
864 character sets as needed.\n\
865 To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
866 set_target_charset_sfunc,
867 show_target_charset_name,
868 &setlist, &showlist);
870 add_setshow_enum_cmd ("target-wide-charset", class_support,
871 &charset_enum[1], &target_wide_charset_name,
873 Set the target wide character set."), _("\
874 Show the target wide character set."), _("\
875 The `target wide character set' is the one used by the program being debugged.\n\
876 In particular it is the encoding used by `wchar_t'.\n\
877 GDB translates characters and strings between the host and target\n\
878 character sets as needed.\n\
879 To see a list of the character sets GDB supports, type\n\
880 `set target-wide-charset'<TAB>"),
881 set_target_wide_charset_sfunc,
882 show_target_wide_charset_name,
883 &setlist, &showlist);