iconv: Add UTF-7-IMAP variant in utf-7.c
authorMax Gautier <mg@max.gautier.name>
Mon, 21 Mar 2022 12:25:05 +0000 (09:25 -0300)
committerAdhemerval Zanella <adhemerval.zanella@linaro.org>
Mon, 21 Mar 2022 16:23:57 +0000 (13:23 -0300)
UTF-7-IMAP differs from UTF-7 in the following ways (see RFC 3501[1]
for reference):

- The shift character is '&' instead of '+'
- There are no "optional direct characters", and the set of "direct
  characters" is different
- There is no implicit shift back to US-ASCII from BASE64; all BASE64
  sequences MUST be terminated with '-'

[1]: https://datatracker.ietf.org/doc/html/rfc3501#section-5.1.3

Signed-off-by: Max Gautier <mg@max.gautier.name>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
iconvdata/TESTS
iconvdata/gconv-modules
iconvdata/testdata/UTF-7-IMAP [new file with mode: 0644]
iconvdata/testdata/UTF-7-IMAP..UTF8 [new file with mode: 0644]
iconvdata/utf-7.c

index 6c0eafc..528ee17 100644 (file)
@@ -94,6 +94,7 @@ EUC-TW                        EUC-TW                  Y       UTF8
 GBK                    GBK                     Y       UTF8
 BIG5HKSCS              BIG5HKSCS               Y       UTF8
 UTF-7                  UTF-7                   N       UTF8
+UTF-7-IMAP             UTF-7-IMAP              N       UTF8
 IBM856                 IBM856                  N       UTF8
 IBM922                 IBM922                  Y       UTF8
 IBM930                 IBM930                  N       UTF8
index 7ee0757..7cd5455 100644 (file)
@@ -113,3 +113,7 @@ module      INTERNAL                UTF-32BE//              UTF-32          1
 alias  UTF7//                  UTF-7//
 module UTF-7//                 INTERNAL                UTF-7           1
 module INTERNAL                UTF-7//                 UTF-7           1
+
+#      from                    to                      module          cost
+module UTF-7-IMAP//            INTERNAL                UTF-7           1
+module INTERNAL                UTF-7-IMAP//            UTF-7           1
diff --git a/iconvdata/testdata/UTF-7-IMAP b/iconvdata/testdata/UTF-7-IMAP
new file mode 100644 (file)
index 0000000..6b5dada
--- /dev/null
@@ -0,0 +1 @@
+&EqASGxItEps-       Amharic&AAoBDQ-esky      Czech&AAo-Dansk      Danish&AAo-English    English&AAo-Suomi      Finnish&AAo-Fran&AOc-ais   French&AAo-Deutsch    German&AAoDlQO7A7sDtwO9A7kDugOs-   Greek&AAoF4gXRBegF2QXq-      Hebrew&AAo-Italiano   Italian&AAo-Norsk      Norwegian&AAoEIARDBEEEQQQ6BDgEOQ-    Russian&AAo-Espa&APE-ol    Spanish&AAo-Svenska    Swedish&AAoOIA4yDikOMg5EDhcOIg-    Thai&AAo-T&APw-rk&AOc-e     Turkish&AAo-Ti&Hr8-ng Vi&Hsc-t Vietnamese&AApl5Wcsip4-     Japanese&AApOLWWH-       Chinese&AArVXK4A-       Korean&AAoACg-// Checking for correct handling of shift characters ('&-', '-') after base64 sequences&AArVXK4A-&-&AArVXK4A--&AAoACg-// Checking for correct handling of litteral '&-' and '-'&AAo----&-&--&AAoACg-// The last line of this file is missing the end-of-line terminator&AAo-// on purpose, in order to test that the conversion empties the bit buffer&AAo-// and shifts back to the initial state at the end of the conversion.&AAo-A&ImIDkQ-
\ No newline at end of file
diff --git a/iconvdata/testdata/UTF-7-IMAP..UTF8 b/iconvdata/testdata/UTF-7-IMAP..UTF8
new file mode 100644 (file)
index 0000000..8b9add3
--- /dev/null
@@ -0,0 +1,32 @@
+አማርኛ       Amharic
+česky      Czech
+Dansk      Danish
+English    English
+Suomi      Finnish
+Français   French
+Deutsch    German
+Ελληνικά   Greek
+עברית      Hebrew
+Italiano   Italian
+Norsk      Norwegian
+Русский    Russian
+Español    Spanish
+Svenska    Swedish
+ภาษาไทย    Thai
+Türkçe     Turkish
+Tiếng Việt Vietnamese
+日本語     Japanese
+中文       Chinese
+한글       Korean
+
+// Checking for correct handling of shift characters ('&', '-') after base64 sequences
+한글&
+한글-
+
+// Checking for correct handling of litteral '&' and '-'
+---&&-
+
+// The last line of this file is missing the end-of-line terminator
+// on purpose, in order to test that the conversion empties the bit buffer
+// and shifts back to the initial state at the end of the conversion.
+A≢Α
\ No newline at end of file
index 393fd35..1b1f68e 100644 (file)
 enum variant
 {
   UTF7,
+  UTF_7_IMAP
 };
 
 /* Must be in the same order as enum variant above.  */
 static const char names[] =
   "UTF-7//\0"
+  "UTF-7-IMAP//\0"
   "\0";
 
 static uint32_t
@@ -45,6 +47,8 @@ shift_character (enum variant const var)
 {
   if (var == UTF7)
     return '+';
+  else if (var == UTF_7_IMAP)
+    return '&';
   else
     abort ();
 }
@@ -58,6 +62,9 @@ between (uint32_t const ch,
 
 /* The set of "direct characters":
    A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
+   For UTF-7-IMAP:
+   A-Z a-z 0-9 ' ( ) , - . / : ? space
+   ! " # $ % + * ; < = > @ [ \ ] ^ _ ` { | } ~
 */
 
 static bool
@@ -71,6 +78,8 @@ isdirect (uint32_t ch, enum variant var)
            || between (ch, ',', '/')
            || ch == ':' || ch == '?'
            || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
+  else if (var == UTF_7_IMAP)
+    return (ch != '&' && between (ch, ' ', '~'));
   abort ();
 }
 
@@ -124,6 +133,8 @@ base64 (unsigned int i, enum variant var)
     return '+';
   else if (i == 63 && var == UTF7)
     return '/';
+  else if (i == 63 && var == UTF_7_IMAP)
+    return ',';
   else
     abort ();
 }
@@ -308,7 +319,8 @@ gconv_end (struct __gconv_step *data)
          i = ch - '0' + 52;                                                  \
        else if (ch == '+')                                                   \
          i = 62;                                                             \
-       else if (ch == '/')                                                   \
+       else if ((var == UTF7 && ch == '/')                                   \
+                 || (var == UTF_7_IMAP && ch == ','))                        \
          i = 63;                                                             \
        else                                                                  \
          {                                                                   \
@@ -316,8 +328,10 @@ gconv_end (struct __gconv_step *data)
                                                                              \
            /* If accumulated data is nonzero, the input is invalid.  */      \
            /* Also, partial UTF-16 characters are invalid.  */               \
-           if (__builtin_expect (statep->__value.__wch != 0, 0)              \
-               || __builtin_expect ((statep->__count >> 3) <= 26, 0))        \
+           /* In IMAP variant, must be terminated by '-'.  */                \
+           if (__glibc_unlikely (statep->__value.__wch != 0)                 \
+               || __glibc_unlikely ((statep->__count >> 3) <= 26)            \
+               || __glibc_unlikely (var == UTF_7_IMAP && ch != '-'))         \
              {                                                               \
                STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));    \
              }                                                               \
@@ -474,13 +488,15 @@ gconv_end (struct __gconv_step *data)
     else                                                                     \
       {                                                                              \
        /* base64 encoding active */                                          \
-       if (isdirect (ch, var))                                               \
+       if ((var == UTF_7_IMAP && ch == '&') || isdirect (ch, var))           \
          {                                                                   \
            /* deactivate base64 encoding */                                  \
            size_t count;                                                     \
                                                                              \
            count = ((statep->__count & 0x18) >= 0x10)                        \
-             + needs_explicit_shift (ch) + 1;                                \
+             + (var == UTF_7_IMAP || needs_explicit_shift (ch))              \
+             + (var == UTF_7_IMAP && ch == '&')                              \
+             + 1;                                                            \
            if (__glibc_unlikely (outptr + count > outend))                   \
              {                                                               \
                result = __GCONV_FULL_OUTPUT;                                 \
@@ -489,9 +505,11 @@ gconv_end (struct __gconv_step *data)
                                                                              \
            if ((statep->__count & 0x18) >= 0x10)                             \
              *outptr++ = base64 ((statep->__count >> 3) & ~3, var);          \
-           if (needs_explicit_shift (ch))                                    \
+           if (var == UTF_7_IMAP || needs_explicit_shift (ch))               \
              *outptr++ = '-';                                                \
            *outptr++ = (unsigned char) ch;                                   \
+           if (var == UTF_7_IMAP && ch == '&')                               \
+             *outptr++ = '-';                                                \
            statep->__count = 0;                                              \
          }                                                                   \
        else                                                                  \