1 /* Character set conversion support for GDB.
3 Copyright (C) 2001, 2003, 2007, 2008, 2009 Free Software Foundation, Inc.
5 This file is part of GDB.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #include "gdb_assert.h"
24 #include "gdb_obstack.h"
25 #include "charset-list.h"
29 #include "gdb_string.h"
33 /* How GDB's character set support works
35 GDB has three global settings:
37 - The `current host character set' is the character set GDB should
38 use in talking to the user, and which (hopefully) the user's
39 terminal knows how to display properly. Most users should not need to change this.
42 - The `current target character set' is the character set the
43 program being debugged uses.
45 - The `current target wide character set' is the wide character set
46 the program being debugged uses, that is, the encoding used for wchar_t.
49 There are commands to set each of these, and mechanisms for
50 choosing reasonable default values. GDB has a global list of
51 character sets that it can use as its host or target character sets.
54 The header file `charset.h' declares various functions that
55 different pieces of GDB need to perform tasks like:
57 - printing target strings and characters to the user's terminal
58 (mostly target->host conversions),
60 - building target-appropriate representations of strings and
61 characters the user enters in expressions (mostly host->target conversions).
66 To avoid excessive code duplication and maintenance efforts,
67 GDB simply requires a capable iconv function. Users on platforms
68 without a suitable iconv can use the GNU iconv library. */
73 /* Provide a phony iconv that does as little as possible. Also,
74 arrange for there to be a single available character set. */
76 #undef GDB_DEFAULT_HOST_CHARSET
77 #define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
78 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
79 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "ISO-8859-1"
80 #undef DEFAULT_CHARSET_NAMES
81 #define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,
90 #define ICONV_CONST const
92 /* Some systems don't have EILSEQ, so we define it here, but not as
93 EINVAL, because callers of `iconv' want to distinguish EINVAL and
94 EILSEQ. This is what iconv.h from libiconv does as well. Note
95 that wchar.h may also define EILSEQ, so this needs to be after we
96 include wchar.h, which happens in defs.h through gdb_wchar.h. */
/* Phony replacement for iconv_open.  Only a fixed set of conversions
   is accepted: FROM must be "UCS-4BE", "wchar_t", or
   GDB_DEFAULT_HOST_CHARSET, and TO must be "wchar_t" or
   GDB_DEFAULT_HOST_CHARSET.  For an accepted pair the returned
   descriptor is just a flag: 1 when FROM is "UCS-4BE", 0 otherwise.
   NOTE(review): what happens for a rejected pair is not visible in
   this excerpt -- confirm against the full source.  */
102 iconv_open (const char *to, const char *from)
104 /* We allow conversions from UCS-4BE, wchar_t, and the host charset.
105 We allow conversions to wchar_t and the host charset. */
/* strcmp yields nonzero on a mismatch, so this condition holds only
   when FROM matches none of the three accepted names.  */
106 if (strcmp (from, "UCS-4BE") && strcmp (from, "wchar_t")
107 && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
/* Likewise, this holds only when TO matches neither accepted name.  */
109 if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
112 /* Return 1 if we are converting from UCS-4BE, 0 otherwise. This is
113 used as a flag in calls to iconv. */
114 return !strcmp (from, "UCS-4BE");
118 iconv_close (iconv_t arg)
/* Phony replacement for iconv.  UCS_FLAG is the descriptor produced by
   the phony iconv_open in this file; per that function's comments it
   is 1 when converting from UCS-4BE and 0 otherwise.  Two strategies
   are visible below: the input is consumed in four-byte groups, each
   folded into a single value C, or the input bytes are copied to the
   output with memcpy.
   NOTE(review): the test that selects between the two strategies, the
   statements that store C and advance the buffers, and the return
   statements are not visible in this excerpt.  */
124 iconv (iconv_t ucs_flag, const char **inbuf, size_t *inbytesleft,
125 char **outbuf, size_t *outbytesleft)
/* Keep going while at least one complete four-byte group remains.  */
129 while (*inbytesleft >= 4)
/* Fold the four bytes of the group into C.  Each byte is masked with
   0xff so that only its low eight bits contribute.  */
134 for (j = 0; j < 4; ++j)
137 c += (*inbuf)[j] & 0xff;
/* Tested after the loop above: fewer than four input bytes remain.
   NOTE(review): the action taken here is not visible.  */
152 if (*inbytesleft < 4)
160 /* In all other cases we simply copy input bytes to the
162 size_t amt = *inbytesleft;
163 if (amt > *outbytesleft)
165 memcpy (*outbuf, *inbuf, amt);
169 *outbytesleft -= amt;
178 /* The number of non-reversible conversions -- but they were all
187 /* The global lists of character sets and translations. */
190 #ifndef GDB_DEFAULT_TARGET_CHARSET
191 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
194 #ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET
195 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "UCS-4"
198 static const char *auto_host_charset_name = GDB_DEFAULT_HOST_CHARSET;
199 static const char *host_charset_name = "auto";
/* Print the host character set setting to FILE.  When VALUE is the
   string "auto", the message also names the character set currently
   held in auto_host_charset_name; for any other VALUE, VALUE itself is
   printed as the host character set.  */
201 show_host_charset_name (struct ui_file *file, int from_tty,
202 struct cmd_list_element *c,
205 if (!strcmp (value, "auto"))
206 fprintf_filtered (file,
207 _("The host character set is \"auto; currently %s\".\n"),
208 auto_host_charset_name);
210 fprintf_filtered (file, _("The host character set is \"%s\".\n"), value);
213 static const char *target_charset_name = GDB_DEFAULT_TARGET_CHARSET;
215 show_target_charset_name (struct ui_file *file, int from_tty,
216 struct cmd_list_element *c, const char *value)
218 fprintf_filtered (file, _("The target character set is \"%s\".\n"),
222 static const char *target_wide_charset_name = GDB_DEFAULT_TARGET_WIDE_CHARSET;
224 show_target_wide_charset_name (struct ui_file *file, int from_tty,
225 struct cmd_list_element *c, const char *value)
227 fprintf_filtered (file, _("The target wide character set is \"%s\".\n"),
231 static const char *default_charset_names[] =
233 DEFAULT_CHARSET_NAMES
237 static const char **charset_enum;
240 /* If the target wide character set has big- or little-endian
241 variants, these are the corresponding names. */
242 static const char *target_wide_charset_be_name;
243 static const char *target_wide_charset_le_name;
245 /* A helper function for validate which sets the target wide big- and
246 little-endian character set names, if possible. */
249 set_be_le_names (void)
/* Reset both names first; they are assigned again below only when a
   matching variant is found in charset_enum.  */
253 target_wide_charset_le_name = NULL;
254 target_wide_charset_be_name = NULL;
/* LEN is the length of the generic name.  A candidate must start with
   that name and carry its endianness suffix immediately after it.  */
256 len = strlen (target_wide_charset_name);
/* charset_enum is scanned up to its terminating NULL entry.  */
257 for (i = 0; charset_enum[i]; ++i)
/* strncmp is nonzero when the candidate does not begin with the
   generic name.  */
259 if (strncmp (target_wide_charset_name, charset_enum[i], len))
/* Accept only a suffix that is exactly "BE" or "LE", followed
   directly by the terminating NUL.  */
261 if ((charset_enum[i][len] == 'B'
262 || charset_enum[i][len] == 'L')
263 && charset_enum[i][len + 1] == 'E'
264 && charset_enum[i][len + 2] == '\0')
/* The first suffix character tells the two variants apart.  */
266 if (charset_enum[i][len] == 'B')
267 target_wide_charset_be_name = charset_enum[i];
269 target_wide_charset_le_name = charset_enum[i];
274 /* 'set charset', 'set host-charset', 'set target-charset', and
275 'set target-wide-charset' sfuncs. */
281 const char *host_cset = host_charset ();
283 desc = iconv_open (target_wide_charset_name, host_cset);
284 if (desc == (iconv_t) -1)
285 error ("Cannot convert between character sets `%s' and `%s'",
286 target_wide_charset_name, host_cset);
289 desc = iconv_open (target_charset_name, host_cset);
290 if (desc == (iconv_t) -1)
291 error ("Cannot convert between character sets `%s' and `%s'",
292 target_charset_name, host_cset);
298 /* This is the sfunc for the 'set charset' command. */
300 set_charset_sfunc (char *charset, int from_tty, struct cmd_list_element *c)
302 /* CAREFUL: set the target charset here as well. */
303 target_charset_name = host_charset_name;
307 /* 'set host-charset' command sfunc. We need a wrapper here because
308 the function needs to have a specific signature. */
310 set_host_charset_sfunc (char *charset, int from_tty,
311 struct cmd_list_element *c)
316 /* Wrapper for the 'set target-charset' command. */
318 set_target_charset_sfunc (char *charset, int from_tty,
319 struct cmd_list_element *c)
324 /* Wrapper for the 'set target-wide-charset' command. */
326 set_target_wide_charset_sfunc (char *charset, int from_tty,
327 struct cmd_list_element *c)
332 /* sfunc for the 'show charset' command. */
334 show_charset (struct ui_file *file, int from_tty, struct cmd_list_element *c,
337 show_host_charset_name (file, from_tty, c, host_charset_name);
338 show_target_charset_name (file, from_tty, c, target_charset_name);
339 show_target_wide_charset_name (file, from_tty, c, target_wide_charset_name);
343 /* Accessor functions. */
348 if (!strcmp (host_charset_name, "auto"))
349 return auto_host_charset_name;
350 return host_charset_name;
354 target_charset (void)
356 return target_charset_name;
/* Return the name of the target wide character set.  When the byte
   order of current_gdbarch is BFD_ENDIAN_BIG and a big-endian variant
   name has been recorded in target_wide_charset_be_name, that variant
   is returned.  A recorded target_wide_charset_le_name is likewise
   returned by the second test.  If neither applies, the generic
   target_wide_charset_name is returned.
   NOTE(review): the little-endian test is presumably the `else' arm of
   the byte-order test, but the lines establishing that are not visible
   in this excerpt -- confirm.  */
360 target_wide_charset (void)
362 if (gdbarch_byte_order (current_gdbarch) == BFD_ENDIAN_BIG)
364 if (target_wide_charset_be_name)
365 return target_wide_charset_be_name;
369 if (target_wide_charset_le_name)
370 return target_wide_charset_le_name;
/* Fallback: no endian-specific variant was selected above.  */
373 return target_wide_charset_name;
377 /* Host character set management. For the time being, we assume that
378 the host character set is some superset of ASCII. */
381 host_letter_to_control_character (char c)
388 /* Convert a host character, C, to its hex value. C must already have
389 been validated using isxdigit. */
392 host_hex_value (char c)
396 if (c >= 'a' && c <= 'f')
398 gdb_assert (c >= 'A' && c <= 'F');
403 /* Public character management functions. */
405 /* A cleanup function which is run to close an iconv descriptor. */
408 cleanup_iconv (void *p)
411 iconv_close (*descp);
/* Convert NUM_BYTES bytes at BYTES from the character set FROM to the
   character set TO, appending the result to the obstack OUTPUT.  If
   FROM and TO are the same string, the bytes are appended unchanged.
   Otherwise the conversion is done with iconv; a failure to open the
   conversion descriptor is reported through perror_with_name.

   TRANSLIT selects the treatment of an invalid input sequence: with
   translit_none an error is raised, otherwise octal escape sequences
   are emitted for WIDTH input bytes.

   NOTE(review): several lines of this function are not visible in this
   excerpt (the loop header, the errno dispatch, the initialization of
   INLEFT, and the code that advances INP), so the comments here
   describe only what the visible lines establish.  */
415 convert_between_encodings (const char *from, const char *to,
416 const gdb_byte *bytes, unsigned int num_bytes,
417 int width, struct obstack *output,
418 enum transliterations translit)
421 struct cleanup *cleanups;
424 unsigned int space_request;
426 /* Often, the host and target charsets will be the same. */
427 if (!strcmp (from, to))
429 obstack_grow (output, bytes, num_bytes);
/* Note the argument order: iconv_open takes the destination character
   set first and the source second.  */
433 desc = iconv_open (to, from);
434 if (desc == (iconv_t) -1)
435 perror_with_name ("Converting character sets");
/* Register cleanup_iconv so that DESC is closed when the cleanups are
   run.  */
436 cleanups = make_cleanup (cleanup_iconv, &desc);
439 inp = (char *) bytes;
/* The initial request for output space equals the input size.  */
441 space_request = num_bytes;
/* Extend the object being built on OUTPUT by SPACE_REQUEST bytes and
   point OUTP at the start of the newly added area.  */
449 old_size = obstack_object_size (output);
450 obstack_blank (output, space_request);
452 outp = obstack_base (output) + old_size;
453 outleft = space_request;
455 r = iconv (desc, (ICONV_CONST char **) &inp, &inleft, &outp, &outleft);
457 /* Now make sure that the object on the obstack only includes
458 bytes we have converted. */
459 obstack_blank (output, - (int) outleft);
/* iconv signals failure by returning (size_t) -1.  */
461 if (r == (size_t) -1)
469 /* Invalid input sequence. */
470 if (translit == translit_none)
471 error (_("Could not convert character to `%s' character set"),
474 /* We emit escape sequence for the bytes, skip them,
476 for (i = 0; i < width; ++i)
480 sprintf (octal, "\\%.3o", *inp & 0xff);
481 obstack_grow_str (output, octal);
490 /* We ran out of space in the output buffer. Make it
491 bigger next time around. */
496 /* Incomplete input sequence. FIXME: ought to report this
497 to the caller somehow. */
502 perror_with_name ("Internal error while converting character sets");
/* Run the cleanups registered since CLEANUPS was obtained, which
   includes the one that closes DESC.  */
507 do_cleanups (cleanups);
512 /* An iterator that returns host wchar_t's from a target string. */
513 struct wchar_iterator
515 /* The underlying iconv descriptor. */
518 /* The input string. This is updated as we convert characters. */
520 /* The number of bytes remaining in the input. */
523 /* The width of an input character. */
526 /* The output buffer and its size. */
531 /* Create a new iterator. */
/* The iterator is set up to convert the BYTES bytes at INPUT from the
   character set CHARSET to "wchar_t" using iconv.  WIDTH is stored in
   the iterator unchanged.  If the conversion descriptor cannot be
   opened, the failure is reported through perror_with_name.  The
   iterator and its output buffer are allocated with XNEW.
   NOTE(review): the end of the parameter list, the assignment of the
   descriptor to the result, and the return statement are not visible
   in this excerpt.  */
532 struct wchar_iterator *
533 make_wchar_iterator (const gdb_byte *input, size_t bytes, const char *charset,
536 struct wchar_iterator *result;
539 desc = iconv_open ("wchar_t", charset);
540 if (desc == (iconv_t) -1)
541 perror_with_name ("Converting character sets");
543 result = XNEW (struct wchar_iterator);
545 result->input = (char *) input;
546 result->bytes = bytes;
547 result->width = width;
/* The output buffer starts with room for a single wide character;
   out_size records that capacity.  */
549 result->out = XNEW (gdb_wchar_t);
550 result->out_size = 1;
556 do_cleanup_iterator (void *p)
558 struct wchar_iterator *iter = p;
560 iconv_close (iter->desc);
566 make_cleanup_wchar_iterator (struct wchar_iterator *iter)
568 return make_cleanup (do_cleanup_iterator, iter);
/* Convert the next piece of ITER's input to host wide characters.
   The outcome is stored in *OUT_RESULT as one of wchar_iterate_ok,
   wchar_iterate_invalid, wchar_iterate_incomplete, or
   wchar_iterate_eof.  On wchar_iterate_ok, *OUT_CHARS is set to ITER's
   output buffer and *LEN to the number of input bytes consumed by this
   call.  On wchar_iterate_invalid, ITER's input position is advanced
   by ITER's width.
   NOTE(review): the end of the parameter list, the errno dispatch, the
   stores through PTR, and the return statements are not visible in
   this excerpt.  */
572 wchar_iterate (struct wchar_iterator *iter,
573 enum wchar_iterate_result *out_result,
574 gdb_wchar_t **out_chars,
575 const gdb_byte **ptr,
580 /* Try to convert some characters. At first we try to convert just
581 a single character. The reason for this is that iconv does not
582 necessarily update its outgoing arguments when it encounters an
583 invalid input sequence -- but we want to reliably report this to
584 our caller so it can emit an escape sequence. */
586 while (iter->bytes > 0)
588 char *outptr = (char *) &iter->out[0];
/* Snapshot the input state before the attempt; ORIG_IN is used below
   to compute how many bytes were consumed.  */
589 char *orig_inptr = iter->input;
590 size_t orig_in = iter->bytes;
/* iconv counts output space in bytes, hence the scaling by the size of
   one wide character.  */
591 size_t out_avail = out_request * sizeof (gdb_wchar_t);
595 size_t r = iconv (iter->desc,
596 (ICONV_CONST char **) &iter->input, &iter->bytes,
597 &outptr, &out_avail);
/* iconv signals failure by returning (size_t) -1.  */
598 if (r == (size_t) -1)
603 /* Invalid input sequence. Skip it, and let the caller
605 *out_result = wchar_iterate_invalid;
608 iter->input += iter->width;
609 iter->bytes -= iter->width;
613 /* We ran out of space. We still might have converted a
614 character; if so, return it. Otherwise, grow the
615 buffer and try again. */
/* Less space left than was offered means iconv wrote something.  */
616 if (out_avail < out_request * sizeof (gdb_wchar_t))
/* Reallocate only when the request exceeds the recorded capacity.  */
620 if (out_request > iter->out_size)
622 iter->out_size = out_request;
623 iter->out = xrealloc (iter->out,
624 out_request * sizeof (gdb_wchar_t));
629 /* Incomplete input sequence. Let the caller know, and
630 arrange for future calls to see EOF. */
631 *out_result = wchar_iterate_incomplete;
638 perror_with_name ("Internal error while converting character sets");
642 /* We converted something. */
/* OUT_AVAIL is in bytes, so it is turned into a count of wide
   characters before being subtracted from the number requested.  */
643 num = out_request - out_avail / sizeof (gdb_wchar_t);
644 *out_result = wchar_iterate_ok;
645 *out_chars = iter->out;
647 *len = orig_in - iter->bytes;
/* Presumably reached once the loop above finds no input bytes left.  */
652 *out_result = wchar_iterate_eof;
657 /* The charset.c module initialization function. */
659 extern initialize_file_ftype _initialize_charset; /* -Wmissing-prototypes */
661 typedef char *char_ptr;
662 DEF_VEC_P (char_ptr);
664 static VEC (char_ptr) *charsets;
669 find_charset_names (void)
671 VEC_safe_push (char_ptr, charsets, GDB_DEFAULT_HOST_CHARSET);
672 VEC_safe_push (char_ptr, charsets, NULL);
675 #else /* PHONY_ICONV */
677 /* Sometimes, libiconv redefines iconvlist as libiconvlist -- but
678 provides different symbols in the static and dynamic libraries.
679 So, configure may see libiconvlist but not iconvlist. But, calling
680 iconvlist is the right thing to do and will work. Hence we do a
681 check here but unconditionally call iconvlist below. */
682 #if defined (HAVE_ICONVLIST) || defined (HAVE_LIBICONVLIST)
684 /* A helper function that adds some character sets to the vector of
685 all character sets. This is a callback function for iconvlist. */
688 add_one (unsigned int count, const char *const *names, void *data)
692 for (i = 0; i < count; ++i)
693 VEC_safe_push (char_ptr, charsets, xstrdup (names[i]));
699 find_charset_names (void)
701 iconvlist (add_one, NULL);
702 VEC_safe_push (char_ptr, charsets, NULL);
/* Populate the `charsets' vector by running the external command
   `iconv -l' and reading its output line by line.  Lines recognized as
   character set names are copied with xstrdup and pushed onto the
   vector, which is finally terminated with a NULL entry.
   NOTE(review): the loop header, the computation of LEN, any trimming
   of BUF, and the closing of the pipe are not visible in this
   excerpt.  */
708 find_charset_names (void)
712 in = popen ("iconv -l", "r");
713 /* It is ok to ignore errors; we'll fall back on a default. */
717 /* POSIX says that iconv -l uses an unspecified format. We parse
718 the glibc format; feel free to add others as needed. */
721 /* The size of buf is chosen arbitrarily. A character set name
722 longer than this would not be very nice. */
725 char *r = fgets (buf, sizeof (buf), in);
/* A line is recognized by two consecutive slashes just before its
   final character.  NOTE(review): this indexing needs LEN to be at
   least 3; the guard, if any, is not visible here -- confirm.  */
731 if (buf[len - 2] == '/' && buf[len - 3] == '/')
733 VEC_safe_push (char_ptr, charsets, xstrdup (buf));
/* The vector is NULL-terminated.  */
738 VEC_safe_push (char_ptr, charsets, NULL);
741 #endif /* HAVE_ICONVLIST || HAVE_LIBICONVLIST */
742 #endif /* PHONY_ICONV */
745 _initialize_charset (void)
747 struct cmd_list_element *new_cmd;
749 /* The first element is always "auto"; then we skip it for the
750 commands where it is not allowed. */
751 VEC_safe_push (char_ptr, charsets, "auto");
752 find_charset_names ();
754 if (VEC_length (char_ptr, charsets) > 1)
755 charset_enum = (const char **) VEC_address (char_ptr, charsets);
757 charset_enum = default_charset_names;
760 #ifdef HAVE_LANGINFO_CODESET
761 auto_host_charset_name = nl_langinfo (CODESET);
762 target_charset_name = auto_host_charset_name;
768 add_setshow_enum_cmd ("charset", class_support,
769 &charset_enum[1], &host_charset_name, _("\
770 Set the host and target character sets."), _("\
771 Show the host and target character sets."), _("\
772 The `host character set' is the one used by the system GDB is running on.\n\
773 The `target character set' is the one used by the program being debugged.\n\
774 You may only use supersets of ASCII for your host character set; GDB does\n\
775 not support any others.\n\
776 To see a list of the character sets GDB supports, type `set charset <TAB>'."),
777 /* Note that the sfunc below needs to set
778 target_charset_name, because the 'set
779 charset' command sets two variables. */
782 &setlist, &showlist);
784 add_setshow_enum_cmd ("host-charset", class_support,
785 charset_enum, &host_charset_name, _("\
786 Set the host character set."), _("\
787 Show the host character set."), _("\
788 The `host character set' is the one used by the system GDB is running on.\n\
789 You may only use supersets of ASCII for your host character set; GDB does\n\
790 not support any others.\n\
791 To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
792 set_host_charset_sfunc,
793 show_host_charset_name,
794 &setlist, &showlist);
796 add_setshow_enum_cmd ("target-charset", class_support,
797 &charset_enum[1], &target_charset_name, _("\
798 Set the target character set."), _("\
799 Show the target character set."), _("\
800 The `target character set' is the one used by the program being debugged.\n\
801 GDB translates characters and strings between the host and target\n\
802 character sets as needed.\n\
803 To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
804 set_target_charset_sfunc,
805 show_target_charset_name,
806 &setlist, &showlist);
808 add_setshow_enum_cmd ("target-wide-charset", class_support,
809 &charset_enum[1], &target_wide_charset_name,
811 Set the target wide character set."), _("\
812 Show the target wide character set."), _("\
813 The `target wide character set' is the one used by the program being debugged.\n\
814 In particular it is the encoding used by `wchar_t'.\n\
815 GDB translates characters and strings between the host and target\n\
816 character sets as needed.\n\
817 To see a list of the character sets GDB supports, type\n\
818 `set target-wide-charset'<TAB>"),
819 set_target_wide_charset_sfunc,
820 show_target_wide_charset_name,
821 &setlist, &showlist);