More char16_t and char32_t support
author Ulrich Drepper <drepper@gmail.com>
Sat, 7 Jan 2012 15:52:53 +0000 (10:52 -0500)
committer Ulrich Drepper <drepper@gmail.com>
Sat, 7 Jan 2012 15:52:53 +0000 (10:52 -0500)
It works now for UTF-8 locales

ChangeLog
iconv/gconv_builtin.h
iconv/gconv_int.h
iconv/gconv_simple.c
iconv/iconv_prog.c
wcsmbs/Makefile
wcsmbs/c16rtomb.c
wcsmbs/mbrtoc16.c
wcsmbs/tst-c16c32-1.c [new file with mode: 0644]
wcsmbs/wcrtomb.c
wcsmbs/wcsmbsload.c

index f089e19..2c0b0f8 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,21 @@
 2012-01-07  Ulrich Drepper  <drepper@gmail.com>
 
+       * iconv/gconv_builtin.h: Use CHAR16 for the char16_t conversions.
+       * iconv/gconv_simple.c: Rename char16_t routines.  Add char16_t<->utf8
+       routines.
+       * iconv/gconv_int.h: Adjust prototypes for char16_t routines.
+       * iconv/iconv_prog.c: Recognize CHAR16 as internal name.
+       * wcsmbs/c16rtomb.c: Fix a few problems.  Disable all the code and
+       fall back to using wcrtomb.
+       * wcsmbs/mbrtoc16.c: Fix implementation to handle real conversions.
+       * wcsmbs/wcsmbsload.c: Make char16 routines optional.  Adjust for
+       renaming.
+       * wcsmbs/Makefile (tests): Add tst-c16c32-1.
+       * wcsmbs/tst-c16c32-1.c: New file.
+
+       * wcsmbs/wcrtomb.c: Use MB_LEN_MAX instead of MB_CUR_MAX for sizing
+       local variable.
+
        * libio/stdio.h: Do not declare gets at all for _GNU_SOURCE.
 
        * elf/tst-unique3.cc: Add explicit declaration of gets.
index fd736a4..6820f82 100644 (file)
@@ -1,5 +1,5 @@
 /* Builtin transformations.
-   Copyright (C) 1997-1999, 2000-2002, 2006, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-1999, 2000-2002, 2006, 2011, 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
@@ -124,22 +124,15 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "UNICODEBIG//", 1,
 #endif
 
 
-BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16//", 1, "=ascii->UTF-16",
-                       __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1)
+BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "CHAR16", 1, "=ascii->CHAR16",
+                       __gconv_transform_ascii_char16, NULL, 1, 1, 2, 4)
 
-BUILTIN_TRANSFORMATION ("UTF-16//", "ANSI_X3.4-1968//", 1, "=UTF-16->ascii",
-                       __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1)
+BUILTIN_TRANSFORMATION ("CHAR16", "ANSI_X3.4-1968//", 1, "=CHAR16->ascii",
+                       __gconv_transform_char16_ascii, NULL, 2, 4, 1, 1)
 
-#if BYTE_ORDER == BIG_ENDIAN
-BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16BE//", 1, "=ascii->UTF-16BE",
-                       __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1)
 
-BUILTIN_TRANSFORMATION ("UTF-16BE//", "ANSI_X3.4-1968//", 1, "=UTF-16BE->ascii",
-                       __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1)
-#else
-BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16LE//", 1, "=ascii->UTF-16LE",
-                       __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1)
+BUILTIN_TRANSFORMATION ("ISO-10646/UTF8/", "CHAR16", 1, "=utf8->CHAR16",
+                       __gconv_transform_utf8_char16, NULL, 1, 6, 2, 4)
 
-BUILTIN_TRANSFORMATION ("UTF-16LE//", "ANSI_X3.4-1968//", 1, "=UTF-16LE->ascii",
-                       __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1)
-#endif
+BUILTIN_TRANSFORMATION ("CHAR16", "ISO-10646/UTF8/", 1, "=CHAR16->utf8",
+                       __gconv_transform_char16_utf8, NULL, 2, 4, 1, 6)
index 80253dd..79de975 100644 (file)
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2005, 2006, 2007, 2011 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2005, 2006, 2007, 2011, 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
@@ -303,8 +303,10 @@ __BUILTIN_TRANSFORM (__gconv_transform_internal_ucs4le);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs4le_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf16);
 __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
-__BUILTIN_TRANSFORM (__gconv_transform_ascii_utf16);
-__BUILTIN_TRANSFORM (__gconv_transform_utf16_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_ascii_char16);
+__BUILTIN_TRANSFORM (__gconv_transform_char16_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_utf8_char16);
+__BUILTIN_TRANSFORM (__gconv_transform_char16_utf8);
 # undef __BUITLIN_TRANSFORM
 
 /* Specialized conversion function for a single byte to INTERNAL, recognizing
index b0ef3e6..d145a3e 100644 (file)
@@ -1,5 +1,5 @@
 /* Simple transformations functions.
-   Copyright (C) 1997-2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-2005, 2007, 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
@@ -1065,6 +1065,7 @@ ucs4le_internal_loop_single (struct __gconv_step *step,
                                                                              \
     state->__count = inend - *inptrp;                                        \
                                                                              \
+    assert (ch != 0xc0 && ch != 0xc1);                                       \
     if (ch >= 0xc2 && ch < 0xe0)                                             \
       {                                                                              \
        /* We expect two bytes.  The first byte cannot be 0xc0 or             \
@@ -1322,15 +1323,15 @@ ucs4le_internal_loop_single (struct __gconv_step *step,
 #include <iconv/skeleton.c>
 
 
-/* Convert from ISO 646-IRV to UTF-16.  */
+/* Convert from ISO 646-IRV to the char16_t format.  */
 #define DEFINE_INIT            0
 #define DEFINE_FINI            0
 #define MIN_NEEDED_FROM                1
 #define MIN_NEEDED_TO          2
 #define FROM_DIRECTION         1
-#define FROM_LOOP              ascii_utf16_loop
-#define TO_LOOP                        ascii_utf16_loop /* This is not used.  */
-#define FUNCTION_NAME          __gconv_transform_ascii_utf16
+#define FROM_LOOP              ascii_char16_loop
+#define TO_LOOP                        ascii_char16_loop /* This is not used.  */
+#define FUNCTION_NAME          __gconv_transform_ascii_char16
 #define ONE_DIRECTION          1
 
 #define MIN_NEEDED_INPUT       MIN_NEEDED_FROM
@@ -1358,15 +1359,15 @@ ucs4le_internal_loop_single (struct __gconv_step *step,
 #include <iconv/skeleton.c>
 
 
-/* Convert from UTF-16 to ISO 646-IRV.  */
+/* Convert from the char16_t format to ISO 646-IRV.  */
 #define DEFINE_INIT            0
 #define DEFINE_FINI            0
 #define MIN_NEEDED_FROM                2
 #define MIN_NEEDED_TO          1
 #define FROM_DIRECTION         1
-#define FROM_LOOP              utf16_ascii_loop
-#define TO_LOOP                        utf16_ascii_loop /* This is not used.  */
-#define FUNCTION_NAME          __gconv_transform_utf16_ascii
+#define FROM_LOOP              char16_ascii_loop
+#define TO_LOOP                        char16_ascii_loop /* This is not used.  */
+#define FUNCTION_NAME          __gconv_transform_char16_ascii
 #define ONE_DIRECTION          1
 
 #define MIN_NEEDED_INPUT       MIN_NEEDED_FROM
@@ -1383,9 +1384,328 @@ ucs4le_internal_loop_single (struct __gconv_step *step,
       {                                                                              \
        /* It's an one byte sequence.  */                                     \
        *outptr++ = *((const uint16_t *) inptr);                              \
-       inptr += sizeof (uint16_t);                                           \
+       inptr += 2;                                                           \
+      }                                                                              \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the char16_t format to UTF-8.  */
+#define DEFINE_INIT            0
+#define DEFINE_FINI            0
+#define MIN_NEEDED_FROM                2
+#define MAX_NEEDED_FROM                4
+#define MIN_NEEDED_TO          1
+#define MAX_NEEDED_TO          6
+#define FROM_DIRECTION         1
+#define FROM_LOOP              char16_utf8_loop
+#define TO_LOOP                        char16_utf8_loop /* This is not used.  */
+#define FUNCTION_NAME          __gconv_transform_char16_utf8
+#define ONE_DIRECTION          1
+
+#define MIN_NEEDED_INPUT       MIN_NEEDED_FROM
+#define MAX_NEEDED_INPUT       MAX_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT      MIN_NEEDED_TO
+#define MAX_NEEDED_OUTPUT      MAX_NEEDED_TO
+#define LOOPFCT                        FROM_LOOP
+#define BODY \
+  {                                                                          \
+    /* Yes, reading a 16-bit number and storing it as 32-bit is correct.  */  \
+    uint32_t wc = *((const uint16_t *) inptr);                               \
+    inptr += 2;                                                                      \
+                                                                             \
+    if (__builtin_expect (wc < 0x80, 1))                                     \
+      /* It's an one byte sequence.  */                                              \
+      *outptr++ = (unsigned char) wc;                                        \
+    else                                                                     \
+      {                                                                              \
+       size_t step;                                                          \
+                                                                             \
+       if (__builtin_expect (wc < 0xd800 || wc > 0xdfff, 1))                 \
+         step = wc < 0x800 ? 2 : 3;                                          \
+       else                                                                  \
+         {                                                                   \
+           if (__builtin_expect (inptr + 2 > inend, 0))                      \
+             {                                                               \
+               /* We don't have enough input for another complete input      \
+                  character.  */                                             \
+               inptr -= 2;                                                   \
+               result = __GCONV_INCOMPLETE_INPUT;                            \
+               break;                                                        \
+             }                                                               \
+                                                                             \
+           uint32_t sec = *((const uint16_t *) inptr);                       \
+           if (__builtin_expect (sec < 0xdc00, 0)                            \
+               || __builtin_expect (sec > 0xdfff, 0))                        \
+             {                                                               \
+               /* This is no valid second word for a surrogate.  */          \
+               STANDARD_FROM_LOOP_ERR_HANDLER (2);                           \
+             }                                                               \
+           inptr += 2;                                                       \
+           wc = ((wc - 0xd7c0) << 10) + (sec - 0xdc00);                      \
+                                                                             \
+           step = wc < 0x200000 ? 4 : 5;                                     \
+         }                                                                   \
+                                                                             \
+       if (__builtin_expect (outptr + step > outend, 0))                     \
+         {                                                                   \
+           /* Too long.  */                                                  \
+           result = __GCONV_FULL_OUTPUT;                                     \
+           inptr -= step >= 4 ? 4 : 2;                                       \
+           break;                                                            \
+         }                                                                   \
+                                                                             \
+       unsigned char *start = outptr;                                        \
+       *outptr = (unsigned char) (~0xff >> step);                            \
+       outptr += step;                                                       \
+       do                                                                    \
+         {                                                                   \
+           start[--step] = 0x80 | (wc & 0x3f);                               \
+           wc >>= 6;                                                         \
+         }                                                                   \
+       while (step > 1);                                                     \
+       start[0] |= wc;                                                       \
       }                                                                              \
   }
 #define LOOP_NEED_FLAGS
 #include <iconv/loop.c>
 #include <iconv/skeleton.c>
+
+
+/* Convert from UTF-8 to the char16_t format.  */
+#define DEFINE_INIT            0
+#define DEFINE_FINI            0
+#define MIN_NEEDED_FROM                1
+#define MAX_NEEDED_FROM                6
+#define MIN_NEEDED_TO          2
+#define MAX_NEEDED_TO          4
+#define FROM_DIRECTION         1
+#define FROM_LOOP              utf8_char16_loop
+#define TO_LOOP                        utf8_char16_loop /* This is not used.  */
+#define FUNCTION_NAME          __gconv_transform_utf8_char16
+#define ONE_DIRECTION          1
+
+#define MIN_NEEDED_INPUT       MIN_NEEDED_FROM
+#define MAX_NEEDED_INPUT       MAX_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT      MIN_NEEDED_TO
+#define LOOPFCT                        FROM_LOOP
+#define BODY \
+  {                                                                          \
+    /* Next input byte.  */                                                  \
+    uint32_t ch = *inptr;                                                    \
+                                                                             \
+    if (__builtin_expect (ch < 0x80, 1))                                     \
+      {                                                                              \
+       /* One byte sequence.  */                                             \
+       *((uint16_t *) outptr) = ch;                                          \
+       outptr += 2;                                                          \
+       ++inptr;                                                              \
+      }                                                                              \
+    else                                                                     \
+      {                                                                              \
+       uint_fast32_t cnt;                                                    \
+       uint_fast32_t i;                                                      \
+                                                                             \
+       if (ch >= 0xc2 && ch < 0xe0)                                          \
+         {                                                                   \
+           /* We expect two bytes.  The first byte cannot be 0xc0 or 0xc1,   \
+              otherwise the wide character could have been represented       \
+              using a single byte.  */                                       \
+           cnt = 2;                                                          \
+           ch &= 0x1f;                                                       \
+         }                                                                   \
+       else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1))                   \
+         {                                                                   \
+           /* We expect three bytes.  */                                     \
+           cnt = 3;                                                          \
+           ch &= 0x0f;                                                       \
+         }                                                                   \
+       else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1))                   \
+         {                                                                   \
+           /* We expect four bytes.  */                                      \
+           cnt = 4;                                                          \
+           ch &= 0x07;                                                       \
+         }                                                                   \
+       else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1))                   \
+         {                                                                   \
+           /* We expect five bytes.  */                                      \
+           cnt = 5;                                                          \
+           ch &= 0x03;                                                       \
+         }                                                                   \
+       else if (__builtin_expect ((ch & 0xfe) == 0xfc, 1))                   \
+         {                                                                   \
+           /* We expect six bytes.  */                                       \
+           cnt = 6;                                                          \
+           ch &= 0x01;                                                       \
+         }                                                                   \
+       else                                                                  \
+         {                                                                   \
+           /* Search the end of this ill-formed UTF-8 character.  This       \
+              is the next byte with (x & 0xc0) != 0x80.  */                  \
+           i = 0;                                                            \
+           do                                                                \
+             ++i;                                                            \
+           while (inptr + i < inend                                          \
+                  && (*(inptr + i) & 0xc0) == 0x80                           \
+                  && i < 5);                                                 \
+                                                                             \
+         errout:                                                             \
+           STANDARD_FROM_LOOP_ERR_HANDLER (i);                               \
+         }                                                                   \
+                                                                             \
+       if (__builtin_expect (inptr + cnt > inend, 0))                        \
+         {                                                                   \
+           /* We don't have enough input.  But before we report that check   \
+              that all the bytes are correct.  */                            \
+           for (i = 1; inptr + i < inend; ++i)                               \
+             if ((inptr[i] & 0xc0) != 0x80)                                  \
+               break;                                                        \
+                                                                             \
+           if (__builtin_expect (inptr + i == inend, 1))                     \
+             {                                                               \
+               result = __GCONV_INCOMPLETE_INPUT;                            \
+               break;                                                        \
+             }                                                               \
+                                                                             \
+           goto errout;                                                      \
+         }                                                                   \
+                                                                             \
+       /* Read the possible remaining bytes.  */                             \
+       for (i = 1; i < cnt; ++i)                                             \
+         {                                                                   \
+           uint32_t byte = inptr[i];                                         \
+                                                                             \
+           if ((byte & 0xc0) != 0x80)                                        \
+             /* This is an illegal encoding.  */                             \
+             break;                                                          \
+                                                                             \
+           ch <<= 6;                                                         \
+           ch |= byte & 0x3f;                                                \
+         }                                                                   \
+                                                                             \
+       /* If i < cnt, some trail byte was not >= 0x80, < 0xc0.               \
+          If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could       \
+          have been represented with fewer than cnt bytes.  */               \
+       if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)                \
+           /* Do not accept UTF-16 surrogates.  */                           \
+           || (ch >= 0xd800 && ch <= 0xdfff))                                \
+         {                                                                   \
+           /* This is an illegal encoding.  */                               \
+           goto errout;                                                      \
+         }                                                                   \
+                                                                             \
+       /* Now adjust the pointers and store the result.  */                  \
+       if (ch < 0x10000)                                                     \
+         *((uint16_t *) outptr) = ch;                                        \
+       else                                                                  \
+         {                                                                   \
+           if (__builtin_expect (outptr + 4 > outend, 0))                    \
+             {                                                               \
+               result = __GCONV_FULL_OUTPUT;                                 \
+               break;                                                        \
+             }                                                               \
+                                                                             \
+           *((uint16_t *) outptr) = 0xd7c0 + (ch >> 10);                     \
+           outptr += 2;                                                      \
+           *((uint16_t *) outptr) = 0xdc00 + (ch & 0x3ff);                   \
+         }                                                                   \
+                                                                             \
+       outptr += 2;                                                          \
+       inptr += cnt;                                                         \
+      }                                                                              \
+  }
+#define LOOP_NEED_FLAGS
+
+#define STORE_REST \
+  {                                                                          \
+    /* We store the remaining bytes while converting them into the UCS4              \
+       format.  We can assume that the first byte in the buffer is           \
+       correct and that it requires a larger number of bytes than there              \
+       are in the input buffer.  */                                          \
+    wint_t ch = **inptrp;                                                    \
+    size_t cnt, r;                                                           \
+                                                                             \
+    state->__count = inend - *inptrp;                                        \
+                                                                             \
+    assert (ch != 0xc0 && ch != 0xc1);                                       \
+    if (ch >= 0xc2 && ch < 0xe0)                                             \
+      {                                                                              \
+       /* We expect two bytes.  The first byte cannot be 0xc0 or             \
+          0xc1, otherwise the wide character could have been                 \
+          represented using a single byte.  */                               \
+       cnt = 2;                                                              \
+       ch &= 0x1f;                                                           \
+      }                                                                              \
+    else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1))                              \
+      {                                                                              \
+       /* We expect three bytes.  */                                         \
+       cnt = 3;                                                              \
+       ch &= 0x0f;                                                           \
+      }                                                                              \
+    else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1))                              \
+      {                                                                              \
+       /* We expect four bytes.  */                                          \
+       cnt = 4;                                                              \
+       ch &= 0x07;                                                           \
+      }                                                                              \
+    else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1))                              \
+      {                                                                              \
+       /* We expect five bytes.  */                                          \
+       cnt = 5;                                                              \
+       ch &= 0x03;                                                           \
+      }                                                                              \
+    else                                                                     \
+      {                                                                              \
+       /* We expect six bytes.  */                                           \
+       cnt = 6;                                                              \
+       ch &= 0x01;                                                           \
+      }                                                                              \
+                                                                             \
+    /* The first byte is already consumed.  */                               \
+    r = cnt - 1;                                                             \
+    while (++(*inptrp) < inend)                                                      \
+      {                                                                              \
+       ch <<= 6;                                                             \
+       ch |= **inptrp & 0x3f;                                                \
+       --r;                                                                  \
+      }                                                                              \
+                                                                             \
+    /* Shift for the so far missing bytes.  */                               \
+    ch <<= r * 6;                                                            \
+                                                                             \
+    /* Store the number of bytes expected for the entire sequence.  */       \
+    state->__count |= cnt << 8;                                                      \
+                                                                             \
+    /* Store the value.  */                                                  \
+    state->__value.__wch = ch;                                               \
+  }
+
+#define UNPACK_BYTES \
+  {                                                                          \
+    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };  \
+    wint_t wch = state->__value.__wch;                                       \
+    size_t ntotal = state->__count >> 8;                                     \
+                                                                             \
+    inlen = state->__count & 255;                                            \
+                                                                             \
+    bytebuf[0] = inmask[ntotal - 2];                                         \
+                                                                             \
+    do                                                                       \
+      {                                                                              \
+       if (--ntotal < inlen)                                                 \
+         bytebuf[ntotal] = 0x80 | (wch & 0x3f);                              \
+       wch >>= 6;                                                            \
+      }                                                                              \
+    while (ntotal > 1);                                                              \
+                                                                             \
+    bytebuf[0] |= wch;                                                       \
+  }
+
+#define CLEAR_STATE \
+  state->__count = 0
+
+
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
index 59c6402..13facc8 100644 (file)
@@ -719,10 +719,12 @@ add_known_names (struct gconv_module *node)
     add_known_names (node->right);
   do
     {
-      if (strcmp (node->from_string, "INTERNAL"))
+      if (strcmp (node->from_string, "INTERNAL") != 0
+         && strcmp (node->from_string, "CHAR16") != 0)
        tsearch (node->from_string, &printlist,
                 (__compar_fn_t) strverscmp);
-      if (strcmp (node->to_string, "INTERNAL") != 0)
+      if (strcmp (node->to_string, "INTERNAL") != 0
+         && strcmp (node->to_string, "CHAR16") != 0)
        tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp);
 
       node = node->same;
@@ -748,7 +750,8 @@ insert_cache (void)
       {
        const char *str = strtab + hashtab[cnt].string_offset;
 
-       if (strcmp (str, "INTERNAL") != 0)
+       if (strcmp (str, "INTERNAL") != 0
+           && strcmp (str, "CHAR16") != 0)
          tsearch (str, &printlist, (__compar_fn_t) strverscmp);
       }
 }
index 8c446e1..010e0c8 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (C) 1995-2000,2002,2003,2004,2005,2006,2007,2011
+# Copyright (C) 1995-2000,2002,2003,2004,2005,2006,2007,2011,2012
 #      Free Software Foundation, Inc.
 # This file is part of the GNU C Library.
 
@@ -46,6 +46,7 @@ routines := wcscat wcschr wcscmp wcscpy wcscspn wcsdup wcslen wcsncat \
 strop-tests :=  wcscmp wmemcmp wcslen wcschr wcsrchr wcscpy
 tests := tst-wcstof wcsmbs-tst1 tst-wcsnlen tst-btowc tst-mbrtowc \
         tst-wcrtomb tst-wcpncpy tst-mbsrtowcs tst-wchar-h tst-mbrtowc2 \
+        tst-c16c32-1 \
         wcsatcliff $(addprefix test-,$(strop-tests))
 
 include ../Rules
index c75ca3b..3fed0b5 100644 (file)
@@ -1,6 +1,6 @@
 /* Copyright (C) 2011, 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@cygnus.com>, 2011.
+   Contributed by Ulrich Drepper <drepper@gmail.com>, 2011.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -44,7 +44,12 @@ static mbstate_t state;
 size_t
 c16rtomb (char *s, char16_t c16, mbstate_t *ps)
 {
-  char buf[MB_CUR_MAX];
+#if 1
+  // XXX The ISO C 11 spec I have does not say anything about handling
+  // XXX surrogates in this interface.
+  return wcrtomb (s, c16, ps ?: &state);
+#else
+  char buf[MB_LEN_MAX];
   struct __gconv_step_data data;
   int status;
   size_t result;
@@ -78,9 +83,9 @@ c16rtomb (char *s, char16_t c16, mbstate_t *ps)
     PTR_DEMANGLE (fct);
 #endif
 
-  /* If C16 is the NUL character we write into the output buffer the byte
-     sequence necessary for PS to get into the initial state, followed
-     by a NUL byte.  */
+  /* If C16 is the NUL character we write into the output buffer
+     the byte sequence necessary for PS to get into the initial
+     state, followed by a NUL byte.  */
   if (c16 == L'\0')
     {
       status = DL_CALL_FCT (fct, (fcts->fromc16, &data, NULL, NULL,
@@ -96,7 +101,8 @@ c16rtomb (char *s, char16_t c16, mbstate_t *ps)
 
       status = DL_CALL_FCT (fct,
                            (fcts->fromc16, &data, &inbuf,
-                            inbuf + sizeof (char16_t), NULL, &dummy, 0, 1));
+                            inbuf + sizeof (char16_t), NULL, &dummy,
+                            0, 1));
     }
 
   /* There must not be any problems with the conversion but illegal input
@@ -118,4 +124,5 @@ c16rtomb (char *s, char16_t c16, mbstate_t *ps)
     }
 
   return result;
+#endif
 }
index 7b5822d..df970fb 100644 (file)
@@ -1,6 +1,6 @@
 /* Copyright (C) 2011, 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@gnu.org>, 2011.
+   Contributed by Ulrich Drepper <drepper@gmail.com>, 2011.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -43,20 +43,32 @@ static mbstate_t state;
 size_t
 mbrtoc16 (char16_t *pc16, const char *s, size_t n, mbstate_t *ps)
 {
-  char16_t buf[1];
+  if (ps == NULL)
+    ps = &state;
+
+  if (ps->__count & 0x80000000)
+    {
+      /* We have to return the second word for a surrogate.  */
+      ps->__count &= 0x7fffffff;
+      *pc16 = ps->__value.__wch;
+      ps->__value.__wch = L'\0';
+      return (size_t) -3;
+    }
+
+  char16_t buf[2];
   struct __gconv_step_data data;
   int status;
   size_t result;
   size_t dummy;
   const unsigned char *inbuf, *endbuf;
-  unsigned char *outbuf = (unsigned char *) (pc16 ?: buf);
+  unsigned char *outbuf = (unsigned char *) buf;
   const struct gconv_fcts *fcts;
 
   /* Set information for this step.  */
   data.__invocation_counter = 0;
   data.__internal_use = 1;
   data.__flags = __GCONV_IS_LAST;
-  data.__statep = ps ?: &state;
+  data.__statep = ps;
   data.__trans = NULL;
 
   /* A first special case is if S is NULL.  This means put PS in the
@@ -85,9 +97,22 @@ mbrtoc16 (char16_t *pc16, const char *s, size_t n, mbstate_t *ps)
   if (fcts->toc16->__shlib_handle != NULL)
     PTR_DEMANGLE (fct);
 #endif
+
+  /* We first have to check whether the character can be represented
+     without a surrogate.  If we immediately pass in a buffer large
+     enough to hold two char16_t values and the first character does
+     not require a surrogate the routine will try to convert more
+     input if N is larger than needed for the first character.  */
   status = DL_CALL_FCT (fct, (fcts->toc16, &data, &inbuf, endbuf,
                              NULL, &dummy, 0, 1));
 
+  if (status == __GCONV_FULL_OUTPUT && data.__outbuf == outbuf)
+    {
+      data.__outbufend = outbuf + 2 * sizeof (char16_t);
+      status = DL_CALL_FCT (fct, (fcts->toc16, &data, &inbuf, endbuf,
+                                 NULL, &dummy, 0, 1));
+    }
+
   /* There must not be any problems with the conversion but illegal input
      characters.  The output buffer must be large enough, otherwise the
      definition of MB_CUR_MAX is not correct.  All the other possible
@@ -100,15 +125,28 @@ mbrtoc16 (char16_t *pc16, const char *s, size_t n, mbstate_t *ps)
   if (status == __GCONV_OK || status == __GCONV_EMPTY_INPUT
       || status == __GCONV_FULL_OUTPUT)
     {
-      if (data.__outbuf != (unsigned char *) outbuf
-         && *(char16_t *) outbuf == U('\0'))
+      if (pc16 != NULL)
+       *pc16 = buf[0];
+
+      if (data.__outbuf != outbuf && *(char16_t *) outbuf == U('\0'))
        {
          /* The converted character is the NUL character.  */
          assert (__mbsinit (data.__statep));
          result = 0;
        }
       else
-       result = inbuf - (const unsigned char *) s;
+       {
+         result = inbuf - (const unsigned char *) s;
+
+         if (data.__outbuf != outbuf + 2)
+           {
+             /* This is a surrogate.  */
+             assert (buf[0] >= 0xd800 && buf[0] <= 0xdfff);
+             assert (buf[1] >= 0xdc00 && buf[1] <= 0xdfff);
+             ps->__count |= 0x80000000;
+             ps->__value.__wch = buf[1];
+           }
+       }
     }
   else if (status == __GCONV_INCOMPLETE_INPUT)
     result = (size_t) -2;
diff --git a/wcsmbs/tst-c16c32-1.c b/wcsmbs/tst-c16c32-1.c
new file mode 100644 (file)
index 0000000..f4534c5
--- /dev/null
@@ -0,0 +1,141 @@
+#include <inttypes.h>
+#include <locale.h>
+#include <stdio.h>
+#include <uchar.h>
+
+
+/* Round-trip a sample of code points through c32rtomb/mbrtoc32,
+   mbrtoc16 and c16rtomb in the de_DE.UTF-8 locale.  Each failed check
+   is reported on stdout.  Returns 0 if every check passed and 1
+   otherwise, including when the locale cannot be selected.  */
+static int
+do_test (void)
+{
+  if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL)
+    {
+      puts ("cannot set locale");
+      return 1;
+    }
+
+  int result = 0;
+
+  char32_t c32 = 48;
+  do
+    {
+      /* U+D800..U+DFFF are surrogates, not characters; skip only those.  */
+      if (c32 >= 0xd800 && c32 <= 0xdfff)
+       continue;
+
+      char buf[20];
+      size_t n1 = c32rtomb (buf, c32, NULL);
+      /* size_t is unsigned, so the (size_t) -1 error return is only
+         visible after converting to a signed type.  */
+      if ((ssize_t) n1 <= 0)
+       {
+         printf ("c32rtomb for U'\\x%" PRIx32 "' failed\n", (uint32_t) c32);
+         result = 1;
+         continue;
+       }
+
+      char32_t c32out;
+      size_t n2 = mbrtoc32 (&c32out, buf, n1, NULL);
+      if ((ssize_t) n2 < 0)
+       {
+         printf ("mbrtoc32 for U'\\x%" PRIx32 "' failed\n", (uint32_t) c32);
+         result = 1;
+         continue;
+       }
+      if (n2 != n1)
+       {
+         printf ("mbrtoc32 for U'\\x%" PRIx32 "' consumed %zu bytes, not %zu\n",
+                 (uint32_t) c32, n2, n1);
+         result = 1;
+       }
+      else if (c32out != c32)
+       {
+         printf ("mbrtoc32 for U'\\x%" PRIx32 "' produced U'\\x%" PRIx32 "'\n",
+                 (uint32_t) c32, (uint32_t) c32out);
+         result = 1;
+       }
+
+      char16_t c16;
+      size_t n3 = mbrtoc16 (&c16, buf, n1, NULL);
+      if (n3 != n1)
+       {
+         printf ("mbrtoc16 for U'\\x%" PRIx32 "' did not consume all bytes\n",
+                 (uint32_t) c32);
+         result = 1;
+         continue;
+       }
+      if (c32 < 0x10000)
+       {
+         if (c16 != c32)
+           {
+             printf ("mbrtoc16 for U'\\x%" PRIx32 "' produce u'\\x%" PRIx16 "'\n",
+                     (uint32_t) c32, (uint16_t) c16);
+             result = 1;
+             continue;
+           }
+       }
+      else
+       {
+         /* Overwrite the input first: the second call is expected to
+            return (size_t) -3 with the second surrogate, not convert BUF.  */
+         buf[0] = '1';
+         char16_t c16_2;
+         size_t n4 = mbrtoc16 (&c16_2, buf, 1, NULL);
+         if (n4 != (size_t) -3)
+           {
+             printf ("second mbrtoc16 for U'\\x%" PRIx32 "' did not return -3\n",
+                     (uint32_t) c32);
+             result = 1;
+             continue;
+           }
+
+         /* Recombine the pair; 0xd7c0 is 0xd800 - (0x10000 >> 10).  */
+         if (c32 != (((uint32_t) (c16 - 0xd7c0)) << 10) + (c16_2 - 0xdc00))
+           {
+             printf ("mbrtoc16 for U'\\x%" PRIx32 "' returns U'\\x%" PRIx32 "'\n",
+                     (uint32_t) c32,
+                     (((uint32_t) (c16 - 0xd7c0)) << 10) + (c16_2 - 0xdc00));
+             result = 1;
+             continue;
+           }
+       }
+
+      buf[0] = '\0';
+      char16_t c16_nul;
+      n3 = mbrtoc16 (&c16_nul, buf, n1, NULL);
+      if (n3 != 0)
+       {
+         printf ("mbrtoc16 for '\\0' returns %zd\n", (ssize_t) n3);
+         result = 1;
+         continue;
+       }
+
+      if (c32 < 0x10000)
+       {
+         size_t n5 = c16rtomb (buf, c16, NULL);
+         if ((ssize_t) n5 < 0)
+           {
+             printf ("c16rtomb for U'\\x%" PRIx32 "' failed with %zd\n",
+                     (uint32_t) c32, (ssize_t) n5);
+             result = 1;
+             continue;
+           }
+         if (n5 != n1)
+           {
+             printf ("c16rtomb for U'\\x%" PRIx32 "' produced %zu bytes instead of %zu bytes\n",
+                     (uint32_t) c32, n5, n1);
+             result = 1;
+             continue;
+           }
+       }
+    }
+  while ((c32 += 0x1111) <= U'\x12000');
+
+  return result;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
index 547b05a..946fdaf 100644 (file)
@@ -1,4 +1,5 @@
-/* Copyright (C) 1996-1998,2000,2002,2005,2011 Free Software Foundation, Inc.
+/* Copyright (C) 1996-1998,2000,2002,2005,2011,2012
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -38,7 +39,7 @@ static mbstate_t state;
 size_t
 __wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
 {
-  char buf[MB_CUR_MAX];
+  char buf[MB_LEN_MAX];
   struct __gconv_step_data data;
   int status;
   size_t result;
index 212a6c8..9ce26f1 100644 (file)
@@ -1,4 +1,5 @@
-/* Copyright (C) 1998-2002,2004,2005,2008,2010,2011 Free Software Foundation, Inc.
+/* Copyright (C) 1998-2002,2004,2005,2008,2010,2011,2012
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
 
@@ -74,7 +75,7 @@ static const struct __gconv_step to_c16 =
   .__counter = INT_MAX,
   .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
   .__to_name = (char *) "UTF-16//",
-  .__fct = __gconv_transform_ascii_utf16,
+  .__fct = __gconv_transform_ascii_char16,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -93,7 +94,7 @@ static const struct __gconv_step from_c16 =
   .__counter = INT_MAX,
   .__from_name = (char *) "UTF-16//",
   .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_utf16_ascii,
+  .__fct = __gconv_transform_char16_ascii,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -209,7 +210,7 @@ __wcsmbs_load_conv (struct __locale_data *new_category)
       int use_translit;
 
       /* Allocate the gconv_fcts structure.  */
-      new_fcts = malloc (sizeof *new_fcts);
+      new_fcts = calloc (1, sizeof *new_fcts);
       if (new_fcts == NULL)
        goto failed;
 
@@ -229,16 +230,24 @@ __wcsmbs_load_conv (struct __locale_data *new_category)
         represent all others.  */
       new_fcts->towc = __wcsmbs_getfct ("INTERNAL", complete_name,
                                        &new_fcts->towc_nsteps);
-      new_fcts->tomb = (new_fcts->towc != NULL
-                       ? __wcsmbs_getfct (complete_name, "INTERNAL",
-                                          &new_fcts->tomb_nsteps)
-                       : NULL);
+      if (new_fcts->towc != NULL)
+       new_fcts->tomb = __wcsmbs_getfct (complete_name, "INTERNAL",
+                                         &new_fcts->tomb_nsteps);
 
-      // XXX
-      new_fcts->toc16 = (struct __gconv_step *) &to_c16;
-      new_fcts->toc16_nsteps = 1;
-      new_fcts->fromc16 = (struct __gconv_step *) &from_c16;
-      new_fcts->fromc16_nsteps = 1;
+      if (new_fcts->tomb != NULL)
+       {
+         new_fcts->toc16 = __wcsmbs_getfct ("CHAR16", complete_name,
+                                            &new_fcts->toc16_nsteps);
+         if (new_fcts->toc16 != NULL)
+           new_fcts->fromc16 = __wcsmbs_getfct (complete_name, "CHAR16",
+                                                &new_fcts->fromc16_nsteps);
+         /* Keep the CHAR16 conversions only as a pair.  */
+         if (new_fcts->toc16 != NULL && new_fcts->fromc16 == NULL)
+           {
+             __gconv_close_transform (new_fcts->toc16, new_fcts->toc16_nsteps);
+             new_fcts->toc16 = NULL;
+           }
+       }
 
       /* If any of the conversion functions is not available we don't
         use any since this would mean we cannot convert back and
@@ -255,6 +264,12 @@ __wcsmbs_load_conv (struct __locale_data *new_category)
        }
       else
        {
+         // XXX At least for now we live with the CHAR16 not being available.
+         if (new_fcts->toc16 == NULL)
+           new_fcts->toc16 = __wcsmbs_gconv_fcts_c.toc16;
+         if (new_fcts->fromc16 == NULL)
+           new_fcts->fromc16 = __wcsmbs_gconv_fcts_c.fromc16;
+
          new_category->private.ctype = new_fcts;
          new_category->private.cleanup = &_nl_cleanup_ctype;
        }
@@ -277,11 +292,15 @@ __wcsmbs_clone_conv (struct gconv_fcts *copy)
   *copy = *orig;
 
   /* Now increment the usage counters.
-     Note: This assumes copy->towc_nsteps == 1 and copy->tomb_nsteps == 1.  */
+     Note: This assumes copy->*_nsteps == 1.  */
   if (copy->towc->__shlib_handle != NULL)
     ++copy->towc->__counter;
   if (copy->tomb->__shlib_handle != NULL)
     ++copy->tomb->__counter;
+  if (copy->toc16->__shlib_handle != NULL)
+    ++copy->toc16->__counter;
+  if (copy->fromc16->__shlib_handle != NULL)
+    ++copy->fromc16->__counter;
 }
 
 
@@ -296,30 +315,24 @@ __wcsmbs_named_conv (struct gconv_fcts *copy, const char *name)
 
   copy->tomb = __wcsmbs_getfct (name, "INTERNAL", &copy->tomb_nsteps);
   if (copy->tomb == NULL)
-    goto out_mb;
-
-#if 0
-  copy->fromc16 = __wcsmbs_getfct (name, "UTF-16//", &copy->fromc16_nsteps);
-  if (copy->fromc16 == NULL)
-    goto out_fromc16;
-
-  copy->toc16 = __wcsmbs_getfct ("UTF-16//", name, &copy->toc16_nsteps);
-  if (copy->toc16 == NULL)
-#else
-  if (0)
-#endif
     {
-#if 0
-      __gconv_close_transform (copy->fromc16, copy->fromc16_nsteps);
-    out_fromc16:
-      __gconv_close_transform (copy->tomb, copy->tomb_nsteps);
-#endif
-    out_mb:
       __gconv_close_transform (copy->towc, copy->towc_nsteps);
-    out_wc:
       return 1;
     }
 
+  copy->fromc16 = __wcsmbs_getfct (name, "CHAR16", &copy->fromc16_nsteps);
+  if (copy->fromc16 == NULL)
+    copy->toc16 = NULL;
+  else
+    {
+      copy->toc16 = __wcsmbs_getfct ("CHAR16", name, &copy->toc16_nsteps);
+      if (copy->toc16 == NULL)
+       {
+         __gconv_close_transform (copy->fromc16, copy->fromc16_nsteps);
+         copy->fromc16 = NULL;
+       }
+    }
+
   return 0;
 }
 
@@ -335,11 +348,8 @@ _nl_cleanup_ctype (struct __locale_data *locale)
       /* Free the old conversions.  */
       __gconv_close_transform (data->tomb, data->tomb_nsteps);
       __gconv_close_transform (data->towc, data->towc_nsteps);
-#if 0
-      // XXX
       __gconv_close_transform (data->fromc16, data->fromc16_nsteps);
-      __gconv_close_transform (data->toc16, data->toc16c_nsteps);
-#endif
+      __gconv_close_transform (data->toc16, data->toc16_nsteps);
       free ((char *) data);
     }
 }