From ecb72bd5b640497474ceae8d5e991b88c0c032e2 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Mon, 19 Feb 2007 04:40:58 +0000 Subject: [PATCH] [BZ #2211] * stdio-common/vfscanf.c: Handle localized digits etc for floating point numbers. Patch mostly by Hamed Malek . --- ChangeLog | 5 + localedata/ChangeLog | 4 + localedata/Makefile | 6 +- localedata/tst-sscanf.c | 56 +++++++++++ stdio-common/vfscanf.c | 242 +++++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 296 insertions(+), 17 deletions(-) create mode 100644 localedata/tst-sscanf.c diff --git a/ChangeLog b/ChangeLog index d898823..be2556b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2007-02-18 Ulrich Drepper + [BZ #2211] + * stdio-common/vfscanf.c: Handle localized digits etc for floating + point numbers. + Patch mostly by Hamed Malek . + * stdio-common/vfscanf.c: Fix problems in width accounting. * stdio-common/tst-sscanf.c (double_tests): New tests. (main): Hook them up. diff --git a/localedata/ChangeLog b/localedata/ChangeLog index 9dfe469..e1bc145 100644 --- a/localedata/ChangeLog +++ b/localedata/ChangeLog @@ -1,5 +1,9 @@ 2007-02-18 Ulrich Drepper + * Makefile (tests): Add tst-sscanf. + (LOCALES): Add fa_IR.UTF-8. + * tst-sscanf.c: New file. + * da_DK.in: Adjust for unified collation. * locales/vi_VN: Don't define HOK here as well. diff --git a/localedata/Makefile b/localedata/Makefile index db7094f..e518175 100644 --- a/localedata/Makefile +++ b/localedata/Makefile @@ -1,4 +1,4 @@ -# Copyright (C) 1996-2002, 2003, 2005 Free Software Foundation, Inc. +# Copyright (C) 1996-2002, 2003, 2005, 2007 Free Software Foundation, Inc. # This file is part of the GNU C Library. 
# The GNU C Library is free software; you can redistribute it and/or @@ -93,7 +93,7 @@ locale_test_suite := tst_iswalnum tst_iswalpha tst_iswcntrl \ tests = $(locale_test_suite) tst-digits tst-setlocale bug-iconv-trans \ tst-leaks tst-mbswcs6 tst-xlocale1 tst-xlocale2 bug-usesetlocale \ - tst-strfmon1 + tst-strfmon1 tst-sscanf ifeq (yes,$(build-shared)) ifneq (no,$(PERL)) tests: $(objpfx)mtrace-tst-leaks @@ -133,7 +133,7 @@ LOCALES := de_DE.ISO-8859-1 de_DE.UTF-8 en_US.ANSI_X3.4-1968 \ en_US.ISO-8859-1 ja_JP.EUC-JP da_DK.ISO-8859-1 \ hr_HR.ISO-8859-2 sv_SE.ISO-8859-1 ja_JP.SJIS fr_FR.ISO-8859-1 \ vi_VN.TCVN5712-1 nb_NO.ISO-8859-1 nn_NO.ISO-8859-1 \ - tr_TR.UTF-8 cs_CZ.UTF-8 zh_TW.EUC-TW + tr_TR.UTF-8 cs_CZ.UTF-8 zh_TW.EUC-TW fa_IR.UTF-8 LOCALE_SRCS := $(shell echo "$(LOCALES)"|sed 's/\([^ .]*\)[^ ]*/\1/g') CHARMAPS := $(shell echo "$(LOCALES)" | \ sed -e 's/[^ .]*[.]\([^ ]*\)/\1/g' -e s/SJIS/SHIFT_JIS/g) diff --git a/localedata/tst-sscanf.c b/localedata/tst-sscanf.c new file mode 100644 index 0000000..89a77a1 --- /dev/null +++ b/localedata/tst-sscanf.c @@ -0,0 +1,56 @@ +#include +#include +#include + +#define P0 "\xDB\xB0" +#define P1 "\xDB\xB1" +#define P2 "\xDB\xB2" +#define P3 "\xDB\xB3" +#define P4 "\xDB\xB4" +#define P5 "\xDB\xB5" +#define P6 "\xDB\xB6" +#define P7 "\xDB\xB7" +#define P8 "\xDB\xB8" +#define P9 "\xDB\xB9" +#define PD "\xd9\xab" +#define PT "\xd9\xac" + +static int +check_sscanf (const char *s, const char *format, const float n) +{ + float f; + + if (sscanf (s, format, &f) != 1) + { + printf ("nothing found for \"%s\"\n", s); + return 1; + } + if (f != n) + { + printf ("got %f expected %f from \"%s\"\n", f, n, s); + return 1; + } + return 0; +} + +static int +do_test (void) +{ + if (setlocale (LC_ALL, "fa_IR") == NULL) + { + puts ("cannot set fa_IR locale"); + return 1; + } + + int r = check_sscanf (P3 PD P1 P4, "%I8f", 3.14); + r |= check_sscanf (P3 PT P1 P4 P5, "%I'f", 3145); + r |= check_sscanf (P3 PD P1 P4 P1 P5 P9, "%If", 3.14159); + r |= 
check_sscanf ("-" P3 PD P1 P4 P1 P5, "%If", -3.1415); + r |= check_sscanf ("+" PD P1 P4 P1 P5, "%If", +.1415); + r |= check_sscanf (P3 PD P1 P4 P1 P5 "e+" P2, "%Ie", 3.1415e+2); + + return r; +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/stdio-common/vfscanf.c b/stdio-common/vfscanf.c index 90e7e36..cdb610d 100644 --- a/stdio-common/vfscanf.c +++ b/stdio-common/vfscanf.c @@ -1,5 +1,4 @@ -/* Copyright (C) 1991-2002, 2003, 2004, 2005, 2006, 2007 - Free Software Foundation, Inc. +/* Copyright (C) 1991-2006, 2007 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -1264,13 +1263,13 @@ _IO_vfscanf_internal (_IO_FILE *s, const char *format, _IO_va_list argptr, mbdigits[n] = strchr (mbdigits[n], '\0') + 1; cmpp = mbdigits[n]; - while ((unsigned char) *cmpp == c && avail > 0) + while ((unsigned char) *cmpp == c && avail >= 0) { if (*++cmpp == '\0') break; else { - if ((c = inchar ()) == EOF) + if (avail == 0 || inchar () == EOF) break; --avail; } @@ -1317,13 +1316,13 @@ _IO_vfscanf_internal (_IO_FILE *s, const char *format, _IO_va_list argptr, int avail = width > 0 ? width : INT_MAX; cmpp = mbdigits[n]; - while ((unsigned char) *cmpp == c && avail > 0) + while ((unsigned char) *cmpp == c && avail >= 0) { if (*++cmpp == '\0') break; else { - if ((c = inchar ()) == EOF) + if (avail == 0 || inchar () == EOF) break; --avail; } @@ -1378,14 +1377,14 @@ _IO_vfscanf_internal (_IO_FILE *s, const char *format, _IO_va_list argptr, const char *cmpp = thousands; int avail = width > 0 ? 
width : INT_MAX; - while ((unsigned char) *cmpp == c && avail > 0) + while ((unsigned char) *cmpp == c && avail >= 0) { ADDW (c); if (*++cmpp == '\0') break; else { - if ((c = inchar ()) == EOF) + if (avail == 0 || inchar () == EOF) break; --avail; } @@ -1450,14 +1449,14 @@ _IO_vfscanf_internal (_IO_FILE *s, const char *format, _IO_va_list argptr, const char *cmpp = thousands; int avail = width > 0 ? width : INT_MAX; - while ((unsigned char) *cmpp == c && avail > 0) + while ((unsigned char) *cmpp == c && avail >= 0) { ADDW (c); if (*++cmpp == '\0') break; else { - if ((c = inchar ()) == EOF) + if (avail == 0 || inchar () == EOF) break; --avail; } @@ -1753,12 +1752,12 @@ _IO_vfscanf_internal (_IO_FILE *s, const char *format, _IO_va_list argptr, if (! got_dot) { - while ((unsigned char) *cmpp == c && avail > 0) + while ((unsigned char) *cmpp == c && avail >= 0) if (*++cmpp == '\0') break; else { - if (inchar () == EOF) + if (avail == 0 || inchar () == EOF) break; --avail; } @@ -1790,12 +1789,12 @@ _IO_vfscanf_internal (_IO_FILE *s, const char *format, _IO_va_list argptr, ++cmp2p; if (cmp2p - thousands == cmpp - decimal) { - while ((unsigned char) *cmp2p == c && avail > 0) + while ((unsigned char) *cmp2p == c && avail >= 0) if (*++cmp2p == '\0') break; else { - if (inchar () == EOF) + if (avail == 0 || inchar () == EOF) break; --avail; } @@ -1828,6 +1827,221 @@ _IO_vfscanf_internal (_IO_FILE *s, const char *format, _IO_va_list argptr, --width; } + wctrans_t map; + if (__builtin_expect ((flags & I18N) != 0, 0) + /* Hexadecimal floats make no sense, fixing localized + digits with ASCII letters. */ + && !is_hexa + /* Minimum requirement. */ + && (wpsize == 0 || got_dot) + && (map = __wctrans ("to_inpunct")) != NULL) + { + /* Reget the first character. */ + inchar (); + + /* Localized digits, decimal points, and thousands + separator. */ + wint_t wcdigits[12]; + + /* First get decimal equivalent to check if we read it + or not. 
*/ + wcdigits[11] = __towctrans (L'.', map); + + /* If we have not read any character or have just read + locale decimal point which matches the decimal point + for localized FP numbers, then we may have localized + digits. Note, we test GOT_DOT above. */ +#ifdef COMPILE_WSCANF + if (wpsize == 0 || (wpsize == 1 && wcdigits[11] == decimal)) +#else + char mbdigits[12][MB_LEN_MAX + 1]; + + mbstate_t state; + memset (&state, '\0', sizeof (state)); + + bool match_so_far = wpsize == 0; + size_t mblen = __wcrtomb (mbdigits[11], wcdigits[11], &state); + if (mblen != (size_t) -1) + { + mbdigits[11][mblen] = '\0'; + match_so_far |= (wpsize == strlen (decimal) + && strcmp (decimal, mbdigits[11]) == 0); + } + else + { + size_t decimal_len = strlen (decimal); + /* This should always be the case but the data comes + from a file. */ + if (decimal_len <= MB_LEN_MAX) + { + match_so_far |= wpsize == decimal_len; + memcpy (mbdigits[11], decimal, decimal_len + 1); + } + else + match_so_far = false; + } + + if (match_so_far) +#endif + { + int have_locthousands = true; + /* Now get the digits and the thousands-sep equivalents. */ + for (int n = 0; n < 11; ++n) + { + if (n < 10) + wcdigits[n] = __towctrans (L'0' + n, map); + else if (n == 10) + wcdigits[10] = __towctrans (L',', map); + +#ifndef COMPILE_WSCANF + memset (&state, '\0', sizeof (state)); + + size_t mblen = __wcrtomb (mbdigits[n], wcdigits[n], + &state); + if (mblen == (size_t) -1) + { + if (n == 10) + { + if (thousands == NULL || (flags & GROUP) == 0) + have_locthousands = false; + else + { + size_t thousands_len = strlen (thousands); + if (thousands_len <= MB_LEN_MAX) + memcpy (mbdigits[10], thousands, + thousands_len + 1); + else + have_locthousands = false; + } + } + else + /* Ignore checking against localized digits. */ + goto no_i18nflt; + } + else + mbdigits[n][mblen] = '\0'; +#endif + } + + /* Start checking against localized digits, if + conversion is done correctly. 
*/ + while (1) + { + if (got_e && wp[wpsize - 1] == exp_char + && (c == L_('-') || c == L_('+'))) + ADDW (c); + else if (wpsize > 0 && !got_e + && (CHAR_T) TOLOWER (c) == exp_char) + { + ADDW (exp_char); + got_e = got_dot = 1; + } + else + { + /* Check against localized digits, decimal point, + and thousands separator. */ + int n; + for (n = 0; n < 12; ++n) + { +#ifdef COMPILE_WSCANF + if (c == wcdigits[n]) + { + if (n < 10) + ADDW (L_('0') + n); + else if (n == 11 && !got_dot) + { + ADDW (decimal); + got_dot = 1; + } + else if (n == 10 && have_locthousands + && ! got_dot) + ADDW (thousands); + else + /* The last read character is not part + of the number anymore. */ + n = 12; + + break; + } +#else + const char *cmpp = mbdigits[n]; + int avail = width > 0 ? width : INT_MAX; + + while ((unsigned char) *cmpp == c && avail >= 0) + if (*++cmpp == '\0') + break; + else + { + if (avail == 0 || inchar () == EOF) + break; + --avail; + } + if (*cmpp == '\0') + { + if (width > 0) + width = avail; + + if (n < 10) + ADDW (L_('0') + n); + else if (n == 11 && !got_dot) + { + /* Add all the characters. */ + for (cmpp = decimal; *cmpp != '\0'; + ++cmpp) + ADDW ((unsigned char) *cmpp); + + got_dot = 1; + } + else if (n == 10 && (flags & GROUP) != 0 + && thousands != NULL && ! got_dot) + { + /* Add all the characters. */ + for (cmpp = thousands; *cmpp != '\0'; + ++cmpp) + ADDW ((unsigned char) *cmpp); + } + else + /* The last read character is not part + of the number anymore. */ + n = 12; + + break; + } + + /* We are pushing all read characters back. */ + if (cmpp > mbdigits[n]) + { + ungetc (c, s); + while (--cmpp > mbdigits[n]) + ungetc_not_eof ((unsigned char) *cmpp, s); + c = (unsigned char) *cmpp; + } +#endif + } + + if (n >= 12) + { + /* The last read character is not part + of the number anymore. 
*/ + ungetc (c, s); + break; + } + } + + if (width == 0 || inchar () == EOF) + break; + + if (width > 0) + --width; + } + } + +#ifndef COMPILE_WSCANF + no_i18nflt: + ; +#endif + } + /* Have we read any character? If we try to read a number in hexadecimal notation and we have read only the `0x' prefix or no exponent this is an error. */ -- 2.7.4