source/common/ucnv_u7.c

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 2002-2016, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   file name:  ucnv_u7.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002jul01
  14 *   created by: Markus W. Scherer
  15 *
  16 *   UTF-7 converter implementation. Used to be in ucnv_utf.c.
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  22
  23 #include "cmemory.h"
  24 #include "unicode/ucnv.h"
  25 #include "ucnv_bld.h"
  26 #include "ucnv_cnv.h"
  27 #include "uassert.h"
  28
  29 /* UTF-7 -------------------------------------------------------------------- */
  30
  31 /*
  32  * UTF-7 is a stateful encoding of Unicode.
  33  * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
  34  * It was intended for use in Internet email systems, using in its bytewise
  35  * encoding only a subset of 7-bit US-ASCII.
  36  * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
  37  * occasionally used.
  38  *
  39  * When converting Unicode to UTF-7, the RFC allows some US-ASCII
  40  * characters to be encoded either directly or in base64. In particular,
  41  * the characters in set O as defined in the RFC (see below) may be encoded
  42  * directly, but are not allowed in, e.g., email headers.
  43  * By default, the ICU UTF-7 converter encodes set O directly.
  44  * By choosing the option "version=1", set O will be escaped instead.
  45  * For example:
  46  *     utf7Converter=ucnv_open("UTF-7,version=1");
  47  *
  48  * For details about email headers see RFC 2047.
  49  */
  50
  51 /*
  52  * Tests for US-ASCII characters belonging to character classes
  53  * defined in UTF-7.
  54  *
  55  * Set D (directly encoded characters) consists of the following
  56  * characters: the upper and lower case letters A through Z
  57  * and a through z, the 10 digits 0-9, and the following nine special
  58  * characters (note that "+" and "=" are omitted):
  59  *     '(),-./:?
  60  *
  61  * Set O (optional direct characters) consists of the following
  62  * characters (note that "\" and "~" are omitted):
  63  *     !"#$%&*;<=>@[]^_`{|}
  64  *
  65  * According to the rules in RFC 2152, the byte values for the following
  66  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
  67  * - all C0 control codes except for CR LF TAB
  68  * - BACKSLASH
  69  * - TILDE
  70  * - DEL
  71  * - all codes beyond US-ASCII, i.e. all >127
  72  */
  73 #define inSetD(c) \
  74     ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
  75      (uint8_t)((c)-48)<10 ||    /* digits */ \
  76      (uint8_t)((c)-39)<3 ||     /* '() */ \
  77      (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
  78      (c)==58 || (c)==63         /* :? */ \
  79     )
  80
  81 #define inSetO(c) \
  82     ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
  83      (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
  84      (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
  85      (uint8_t)((c)-123)<3 ||        /* {|} */ \
  86      (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
  87     )
  88
  89 #define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
  90 #define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)
  91
  92 #define PLUS  43
  93 #define MINUS 45
  94 #define BACKSLASH 92
  95 #define TILDE 126
  96
  97 /* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
  98 #define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
  99
 100 /* encode directly sets D and O and CR LF SP TAB */
 101 static const UBool encodeDirectlyMaximum[128]={
 102  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
 103     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
 104     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 105
 106     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
 107     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 108
 109     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 110     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
 111
 112     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 113     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
 114 };
 115
 116 /* encode directly set D and CR LF SP TAB but not set O */
 117 static const UBool encodeDirectlyRestricted[128]={
 118  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
 119     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
 120     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 121
 122     1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
 123     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
 124
 125     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 126     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 127
 128     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 129     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
 130 };
 131
 132 static const uint8_t
 133 toBase64[64]={
 134     /* A-Z */
 135     65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
 136     78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
 137     /* a-z */
 138     97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
 139     110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
 140     /* 0-9 */
 141     48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
 142     /* +/ */
 143     43, 47
 144 };
 145
 146 static const int8_t
 147 fromBase64[128]={
 148     /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
 149     -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
 150     -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
 151
 152     /* general punctuation with + and / and a special value (-2) for - */
 153     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
 154     /* digits */
 155     52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
 156
 157     /* A-Z */
 158     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 159     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
 160
 161     /* a-z */
 162     -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
 163     41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
 164 };
 165
 166 /*
 167  * converter status values:
 168  *
 169  * toUnicodeStatus:
 170  *     24 inDirectMode (boolean)
 171  * 23..16 base64Counter (-1..7)
 172  * 15..0  bits (up to 14 bits incoming base64)
 173  *
 174  * fromUnicodeStatus:
 175  * 31..28 version (0: set O direct  1: set O escaped)
 176  *     24 inDirectMode (boolean)
 177  * 23..16 base64Counter (0..2)
 178  *  7..0  bits (6 bits outgoing base64)
 179  *
 180  */
 181
 182 static void
 183 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
 184     if(choice<=UCNV_RESET_TO_UNICODE) {
 185         /* reset toUnicode */
 186         cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
 187         cnv->toULength=0;
 188     }
 189     if(choice!=UCNV_RESET_TO_UNICODE) {
 190         /* reset fromUnicode */
 191         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 192     }
 193 }
 194
 195 static void
 196 _UTF7Open(UConverter *cnv,
 197           UConverterLoadArgs *pArgs,
 198           UErrorCode *pErrorCode) {
 199     if(UCNV_GET_VERSION(cnv)<=1) {
 200         /* TODO(markus): Should just use cnv->options rather than copying the version number. */
 201         cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
 202         _UTF7Reset(cnv, UCNV_RESET_BOTH);
 203     } else {
 204         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 205     }
 206 }
 207
 208 static void
 209 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 210                           UErrorCode *pErrorCode) {
         /*
          * Converts UTF-7 bytes from pArgs->source (up to pArgs->sourceLimit)
          * into UChars written at pArgs->target (up to pArgs->targetLimit).
          * If pArgs->offsets is not NULL, one source index is stored there
          * for each UChar that is written.
          *
          * The state is loaded from and stored back into the converter:
          * - cnv->toUnicodeStatus holds inDirectMode, base64Counter and bits
          * - cnv->toUBytes/toULength hold the bytes of the sequence in progress
          *   (or of the erroneous sequence when an error is set)
          *
          * Sets *pErrorCode to U_ILLEGAL_CHAR_FOUND for illegal bytes and for
          * malformed or incomplete base64 sequences, and to
          * U_BUFFER_OVERFLOW_ERROR when input remains but the target is full.
          * On return, pArgs->source, pArgs->target and pArgs->offsets are
          * advanced past what was consumed and produced.
          */
 211     UConverter *cnv;
 212     const uint8_t *source, *sourceLimit;
 213     UChar *target;
 214     const UChar *targetLimit;
 215     int32_t *offsets;
 216
 217     uint8_t *bytes;
 218     uint8_t byteIndex;
 219
 220     int32_t length, targetCapacity;
 221
 222     /* UTF-7 state */
 223     uint16_t bits;
 224     int8_t base64Counter;
 225     UBool inDirectMode;
 226
 227     int8_t base64Value;
 228
 229     int32_t sourceIndex, nextSourceIndex;
 230
 231     uint8_t b;
 232     /* set up the local pointers */
 233     cnv=pArgs->converter;
 234
 235     source=(const uint8_t *)pArgs->source;
 236     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 237     target=pArgs->target;
 238     targetLimit=pArgs->targetLimit;
 239     offsets=pArgs->offsets;
 240     /* get the state machine state */
 241     {
 242         uint32_t status=cnv->toUnicodeStatus;
 243         inDirectMode=(UBool)((status>>24)&1);
 244         base64Counter=(int8_t)(status>>16);
 245         bits=(uint16_t)status;
 246     }
 247     bytes=cnv->toUBytes;
 248     byteIndex=cnv->toULength;
 249
 250     /* sourceIndex=-1 if the current character began in the previous buffer */
 251     sourceIndex=byteIndex==0 ? 0 : -1;
 252     nextSourceIndex=0;
 253
 254     if(inDirectMode) {
 255 directMode:
 256         /*
 257          * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
 258          * with their US-ASCII byte values.
 259          * Backslash and Tilde and most control characters are not allowed in UTF-7.
 260          * A plus sign starts Unicode (or "escape") Mode.
 261          *
 262          * In Direct Mode, only the sourceIndex is used.
 263          */
 264         byteIndex=0;
             /*
              * Each input byte yields at most one UChar here, so the loop is
              * bounded by the smaller of the remaining input and the remaining
              * output capacity; no per-byte check of the target is needed.
              */
 265         length=(int32_t)(sourceLimit-source);
 266         targetCapacity=(int32_t)(targetLimit-target);
 267         if(length>targetCapacity) {
 268             length=targetCapacity;
 269         }
 270         while(length>0) {
 271             b=*source++;
 272             if(!isLegalUTF7(b)) {
 273                 /* illegal */
 274                 bytes[0]=b;
 275                 byteIndex=1;
 276                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 277                 break;
 278             } else if(b!=PLUS) {
 279                 /* write directly encoded character */
 280                 *target++=b;
 281                 if(offsets!=NULL) {
 282                     *offsets++=sourceIndex++;
 283                 }
 284             } else /* PLUS */ {
 285                 /* switch to Unicode mode */
 286                 nextSourceIndex=++sourceIndex;
 287                 inDirectMode=FALSE;
 288                 byteIndex=0;
 289                 bits=0;
 290                 base64Counter=-1;
 291                 goto unicodeMode;
 292             }
 293             --length;
 294         }
 295         if(source<sourceLimit && target>=targetLimit) {
 296             /* target is full */
 297             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 298         }
 299     } else {
 300 unicodeMode:
 301         /*
 302          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
 303          * The base64 sequence ends with any character that is not in the base64 alphabet.
 304          * A terminating minus sign is consumed.
 305          *
 306          * In Unicode Mode, the sourceIndex has the index to the start of the current
 307          * base64 bytes, while nextSourceIndex is precisely parallel to source,
 308          * keeping the index to the following byte.
 309          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
 310          */
 311         while(source<sourceLimit) {
 312             if(target<targetLimit) {
 313                 bytes[byteIndex++]=b=*source++;
 314                 ++nextSourceIndex;
 315                 base64Value = -3; /* initialize as illegal */
                     /* b>=126 is tested first so that fromBase64[] is only indexed with b<=125 */
 316                 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
 317                     /* either
 318                      * base64Value==-1 for any legal character except base64 and minus sign, or
 319                      * base64Value==-3 for illegal characters:
 320                      * 1. In either case, leave Unicode mode.
 321                      * 2.1. If we ended with an incomplete UChar or none after the +, then
 322                      *      generate an error for the preceding erroneous sequence and deal with
 323                      *      the current (possibly illegal) character next time through.
 324                      * 2.2. Else the current char comes after a complete UChar, which was already
 325                      *      pushed to the output buf, so:
 326                      * 2.2.1. If the current char is legal, just save it for processing next time.
 327                      *        It may be for example, a plus which we need to deal with in direct mode.
 328                      * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
 329                      */
 330                     inDirectMode=TRUE;
 331                     if(base64Counter==-1) {
 332                         /* illegal: + immediately followed by something other than base64 or minus sign */
 333                         /* include the plus sign in the reported sequence, but not the subsequent char */
 334                         --source;
 335                         bytes[0]=PLUS;
 336                         byteIndex=1;
 337                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 338                         break;
 339                     } else if(bits!=0) {
 340                         /* bits are illegally left over, a UChar is incomplete */
 341                         /* don't include current char (legal or illegal) in error seq */
 342                         --source;
 343                         --byteIndex;
 344                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 345                         break;
 346                     } else {
 347                         /* previous UChar was complete */
 348                         if(base64Value==-3) {
 349                             /* current character is illegal, deal with it here */
 350                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 351                             break;
 352                         } else {
 353                             /* un-read the current character in case it is a plus sign */
 354                             --source;
 355                             sourceIndex=nextSourceIndex-1;
 356                             goto directMode;
 357                         }
 358                     }
 359                 } else if(base64Value>=0) {
 360                     /* collect base64 bytes into UChars */
                         /*
                          * base64Counter is the position within a group of 8 base64
                          * digits (8*6=48 bits=3 UChars). A UChar is completed at
                          * positions 2, 5 and 7; at positions 2 and 5 the low 2 or 4
                          * bits of the current digit are carried over in bits.
                          */
 361                     switch(base64Counter) {
 362                     case -1: /* -1 is immediately after the + */
 363                     case 0:
 364                         bits=base64Value;
 365                         base64Counter=1;
 366                         break;
 367                     case 1:
 368                     case 3:
 369                     case 4:
 370                     case 6:
 371                         bits=(uint16_t)((bits<<6)|base64Value);
 372                         ++base64Counter;
 373                         break;
 374                     case 2:
 375                         *target++=(UChar)((bits<<4)|(base64Value>>2));
 376                         if(offsets!=NULL) {
 377                             *offsets++=sourceIndex;
 378                             sourceIndex=nextSourceIndex-1;
 379                         }
 380                         bytes[0]=b; /* keep this byte in case an error occurs */
 381                         byteIndex=1;
 382                         bits=(uint16_t)(base64Value&3);
 383                         base64Counter=3;
 384                         break;
 385                     case 5:
 386                         *target++=(UChar)((bits<<2)|(base64Value>>4));
 387                         if(offsets!=NULL) {
 388                             *offsets++=sourceIndex;
 389                             sourceIndex=nextSourceIndex-1;
 390                         }
 391                         bytes[0]=b; /* keep this byte in case an error occurs */
 392                         byteIndex=1;
 393                         bits=(uint16_t)(base64Value&15);
 394                         base64Counter=6;
 395                         break;
 396                     case 7:
 397                         *target++=(UChar)((bits<<6)|base64Value);
 398                         if(offsets!=NULL) {
 399                             *offsets++=sourceIndex;
 400                             sourceIndex=nextSourceIndex;
 401                         }
 402                         byteIndex=0;
 403                         bits=0;
 404                         base64Counter=0;
 405                         break;
 406                     default:
 407                         /* will never occur */
 408                         break;
 409                     }
 410                 } else /*base64Value==-2*/ {
 411                     /* minus sign terminates the base64 sequence */
 412                     inDirectMode=TRUE;
 413                     if(base64Counter==-1) {
 414                         /* +- i.e. a minus immediately following a plus */
 415                         *target++=PLUS;
 416                         if(offsets!=NULL) {
 417                             *offsets++=sourceIndex-1;
 418                         }
 419                     } else {
 420                         /* absorb the minus and leave the Unicode Mode */
 421                         if(bits!=0) {
 422                             /* bits are illegally left over, a UChar is incomplete */
 423                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 424                             break;
 425                         }
 426                     }
 427                     sourceIndex=nextSourceIndex;
 428                     goto directMode;
 429                 }
 430             } else {
 431                 /* target is full */
 432                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 433                 break;
 434             }
 435         }
 436     }
 437
 438     if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
 439         /*
 440          * if we are in Unicode mode, then the byteIndex might not be 0,
 441          * but that is ok if bits==0
 442          * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
 443          * (not true for IMAP-mailbox-name where we must end in direct mode)
 444          */
 445         byteIndex=0;
 446     }
 447
 448     /* set the converter state back into UConverter */
         /* base64Counter goes through uint8_t so that -1 occupies only bits 23..16 */
 449     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
 450     cnv->toULength=byteIndex;
 451
 452     /* write back the updated pointers */
 453     pArgs->source=(const char *)source;
 454     pArgs->target=target;
 455     pArgs->offsets=offsets;
 456     return;
 457 }
 458
 459 static void
 460 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 461                             UErrorCode *pErrorCode) {
 462     UConverter *cnv;
 463     const UChar *source, *sourceLimit;
 464     uint8_t *target, *targetLimit;
 465     int32_t *offsets;
 466
 467     int32_t length, targetCapacity, sourceIndex;
 468     UChar c;
 469
 470     /* UTF-7 state */
 471     const UBool *encodeDirectly;
 472     uint8_t bits;
 473     int8_t base64Counter;
 474     UBool inDirectMode;
 475
 476     /* set up the local pointers */
 477     cnv=pArgs->converter;
 478
 479     /* set up the local pointers */
 480     source=pArgs->source;
 481     sourceLimit=pArgs->sourceLimit;
 482     target=(uint8_t *)pArgs->target;
 483     targetLimit=(uint8_t *)pArgs->targetLimit;
 484     offsets=pArgs->offsets;
 485
 486     /* get the state machine state */
 487     {
 488         uint32_t status=cnv->fromUnicodeStatus;
 489         encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
 490         inDirectMode=(UBool)((status>>24)&1);
 491         base64Counter=(int8_t)(status>>16);
 492         bits=(uint8_t)status;
 493         U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
 494     }
 495
 496     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
 497     sourceIndex=0;
 498
 499     if(inDirectMode) {
 500 directMode:
 501         length=(int32_t)(sourceLimit-source);
 502         targetCapacity=(int32_t)(targetLimit-target);
 503         if(length>targetCapacity) {
 504             length=targetCapacity;
 505         }
 506         while(length>0) {
 507             c=*source++;
 508             /* currently always encode CR LF SP TAB directly */
 509             if(c<=127 && encodeDirectly[c]) {
 510                 /* encode directly */
 511                 *target++=(uint8_t)c;
 512                 if(offsets!=NULL) {
 513                     *offsets++=sourceIndex++;
 514                 }
 515             } else if(c==PLUS) {
 516                 /* output +- for + */
 517                 *target++=PLUS;
 518                 if(target<targetLimit) {
 519                     *target++=MINUS;
 520                     if(offsets!=NULL) {
 521                         *offsets++=sourceIndex;
 522                         *offsets++=sourceIndex++;
 523                     }
 524                     /* realign length and targetCapacity */
 525                     goto directMode;
 526                 } else {
 527                     if(offsets!=NULL) {
 528                         *offsets++=sourceIndex++;
 529                     }
 530                     cnv->charErrorBuffer[0]=MINUS;
 531                     cnv->charErrorBufferLength=1;
 532                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 533                     break;
 534                 }
 535             } else {
 536                 /* un-read this character and switch to Unicode Mode */
 537                 --source;
 538                 *target++=PLUS;
 539                 if(offsets!=NULL) {
 540                     *offsets++=sourceIndex;
 541                 }
 542                 inDirectMode=FALSE;
 543                 base64Counter=0;
 544                 goto unicodeMode;
 545             }
 546             --length;
 547         }
 548         if(source<sourceLimit && target>=targetLimit) {
 549             /* target is full */
 550             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 551         }
 552     } else {
 553 unicodeMode:
 554         while(source<sourceLimit) {
 555             if(target<targetLimit) {
 556                 c=*source++;
 557                 if(c<=127 && encodeDirectly[c]) {
 558                     /* encode directly */
 559                     inDirectMode=TRUE;
 560
 561                     /* trick: back out this character to make this easier */
 562                     --source;
 563
 564                     /* terminate the base64 sequence */
 565                     if(base64Counter!=0) {
 566                         /* write remaining bits for the previous character */
 567                         *target++=toBase64[bits];
 568                         if(offsets!=NULL) {
 569                             *offsets++=sourceIndex-1;
 570                         }
 571                     }
 572                     if(fromBase64[c]!=-1) {
 573                         /* need to terminate with a minus */
 574                         if(target<targetLimit) {
 575                             *target++=MINUS;
 576                             if(offsets!=NULL) {
 577                                 *offsets++=sourceIndex-1;
 578                             }
 579                         } else {
 580                             cnv->charErrorBuffer[0]=MINUS;
 581                             cnv->charErrorBufferLength=1;
 582                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 583                             break;
 584                         }
 585                     }
 586                     goto directMode;
 587                 } else {
 588                     /*
 589                      * base64 this character:
 590                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
 591                      * and the bits of this character, each implicitly in UTF-16BE.
 592                      *
 593                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
 594                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
 595                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
 596                      */
 597                     switch(base64Counter) {
 598                     case 0:
 599                         *target++=toBase64[c>>10];
 600                         if(target<targetLimit) {
 601                             *target++=toBase64[(c>>4)&0x3f];
 602                             if(offsets!=NULL) {
 603                                 *offsets++=sourceIndex;
 604                                 *offsets++=sourceIndex++;
 605                             }
 606                         } else {
 607                             if(offsets!=NULL) {
 608                                 *offsets++=sourceIndex++;
 609                             }
 610                             cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
 611                             cnv->charErrorBufferLength=1;
 612                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 613                         }
 614                         bits=(uint8_t)((c&15)<<2);
 615                         base64Counter=1;
 616                         break;
 617                     case 1:
 618                         *target++=toBase64[bits|(c>>14)];
 619                         if(target<targetLimit) {
 620                             *target++=toBase64[(c>>8)&0x3f];
 621                             if(target<targetLimit) {
 622                                 *target++=toBase64[(c>>2)&0x3f];
 623                                 if(offsets!=NULL) {
 624                                     *offsets++=sourceIndex;
 625                                     *offsets++=sourceIndex;
 626                                     *offsets++=sourceIndex++;
 627                                 }
 628                             } else {
 629                                 if(offsets!=NULL) {
 630                                     *offsets++=sourceIndex;
 631                                     *offsets++=sourceIndex++;
 632                                 }
 633                                 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
 634                                 cnv->charErrorBufferLength=1;
 635                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 636                             }
 637                         } else {
 638                             if(offsets!=NULL) {
 639                                 *offsets++=sourceIndex++;
 640                             }
 641                             cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
 642                             cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
 643                             cnv->charErrorBufferLength=2;
 644                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 645                         }
 646                         bits=(uint8_t)((c&3)<<4);
 647                         base64Counter=2;
 648                         break;
 649                     case 2:
 650                         *target++=toBase64[bits|(c>>12)];
 651                         if(target<targetLimit) {
 652                             *target++=toBase64[(c>>6)&0x3f];
 653                             if(target<targetLimit) {
 654                                 *target++=toBase64[c&0x3f];
 655                                 if(offsets!=NULL) {
 656                                     *offsets++=sourceIndex;
 657                                     *offsets++=sourceIndex;
 658                                     *offsets++=sourceIndex++;
 659                                 }
 660                             } else {
 661                                 if(offsets!=NULL) {
 662                                     *offsets++=sourceIndex;
 663                                     *offsets++=sourceIndex++;
 664                                 }
 665                                 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
 666                                 cnv->charErrorBufferLength=1;
 667                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 668                             }
 669                         } else {
 670                             if(offsets!=NULL) {
 671                                 *offsets++=sourceIndex++;
 672                             }
 673                             cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
 674                             cnv->charErrorBuffer[1]=toBase64[c&0x3f];
 675                             cnv->charErrorBufferLength=2;
 676                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 677                         }
 678                         bits=0;
 679                         base64Counter=0;
 680                         break;
 681                     default:
 682                         /* will never occur */
 683                         break;
 684                     }
 685                 }
 686             } else {
 687                 /* target is full */
 688                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 689                 break;
 690             }
 691         }
 692     }
 693
 694     if(pArgs->flush && source>=sourceLimit) {
 695         /* flush remaining bits to the target */
 696         if(!inDirectMode) {
 697             if (base64Counter!=0) {
 698                 if(target<targetLimit) {
 699                     *target++=toBase64[bits];
 700                     if(offsets!=NULL) {
 701                         *offsets++=sourceIndex-1;
 702                     }
 703                 } else {
 704                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
 705                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 706                 }
 707             }
 708             /* Add final MINUS to terminate unicodeMode */
 709             if(target<targetLimit) {
 710                 *target++=MINUS;
 711                 if(offsets!=NULL) {
 712                     *offsets++=sourceIndex-1;
 713                 }
 714             } else {
 715                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
 716                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 717             }
 718         }
 719         /* reset the state for the next conversion */
 720         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 721     } else {
 722         /* set the converter state back into UConverter */
 723         cnv->fromUnicodeStatus=
 724             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
 725             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
 726     }
 727
 728     /* write back the updated pointers */
 729     pArgs->source=source;
 730     pArgs->target=(char *)target;
 731     pArgs->offsets=offsets;
 732     return;
 733 }
 734
 735 static const char *
 736 _UTF7GetName(const UConverter *cnv) {
 737     switch(cnv->fromUnicodeStatus>>28) {
 738     case 1:
 739         return "UTF-7,version=1";
 740     default:
 741         return "UTF-7";
 742     }
 743 }
 744
 /*
  * Implementation table for the UTF-7 converter.
  *
  * The initializers are positional: their meaning is fixed by the member
  * order of UConverterImpl, which is declared outside this section.
  * NOTE(review): the to-Unicode and from-Unicode functions are each listed
  * twice, presumably for the plain and the with-offsets entry points -
  * confirm against the UConverterImpl declaration.
  */
 745 static const UConverterImpl _UTF7Impl={
 746     UCNV_UTF7,
 747
 748     NULL,
 749     NULL,
 750
 751     _UTF7Open,
 752     NULL,
 753     _UTF7Reset,
 754
 755     _UTF7ToUnicodeWithOffsets,
 756     _UTF7ToUnicodeWithOffsets,
 757     _UTF7FromUnicodeWithOffsets,
 758     _UTF7FromUnicodeWithOffsets,
 759     NULL,
 760
 761     NULL,
 762     _UTF7GetName,
 763     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
 764     NULL,
 765     ucnv_getCompleteUnicodeSet,
 766
 767     NULL,
 768     NULL
 769 };
 770
 /*
  * Static description of the UTF-7 converter.
  *
  * The initializers are positional; the field meanings come from
  * UConverterStaticData, which is declared outside this section. Visible
  * here: the structure's own size, the name "UTF-7", a placeholder 0 for the
  * CCSID, the UCNV_IBM / UCNV_UTF7 constants, and a substitution byte 0x3f
  * that the existing comment marks as unused.
  * NOTE(review): "1, 4" looks like the minimum/maximum number of bytes per
  * character - confirm against the UConverterStaticData declaration.
  */
 771 static const UConverterStaticData _UTF7StaticData={
 772     sizeof(UConverterStaticData),
 773     "UTF-7",
 774     0, /* TODO CCSID for UTF-7 */
 775     UCNV_IBM, UCNV_UTF7,
 776     1, 4,
 777     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
 778     FALSE, FALSE,
 779     0,
 780     0,
 781     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 782 };
 783
 /*
  * Shared data object for the UTF-7 converter: combines _UTF7StaticData and
  * _UTF7Impl through UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER.
  * It is the only UTF-7 definition here without "static", i.e. it has
  * external linkage.
  */
 784 const UConverterSharedData _UTF7Data=
 785         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
 786
 787 /* IMAP mailbox name encoding ----------------------------------------------- */
 788
 789 /*
 790  * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
 791  * http://www.ietf.org/rfc/rfc2060.txt
 792  *
 793  * 5.1.3.  Mailbox International Naming Convention
 794  *
 795  * By convention, international mailbox names are specified using a
 796  * modified version of the UTF-7 encoding described in [UTF-7].  The
 797  * purpose of these modifications is to correct the following problems
 798  * with UTF-7:
 799  *
 800  *    1) UTF-7 uses the "+" character for shifting; this conflicts with
 801  *       the common use of "+" in mailbox names, in particular USENET
 802  *       newsgroup names.
 803  *
 804  *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
 805  *       conflicts with the use of "/" as a popular hierarchy delimiter.
 806  *
 807  *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
 808  *       the use of "\" as a popular hierarchy delimiter.
 809  *
 810  *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
 811  *       the use of "~" in some servers as a home directory indicator.
 812  *
 813  *    5) UTF-7 permits multiple alternate forms to represent the same
 814  *       string; in particular, printable US-ASCII characters can be
 815  *       represented in encoded form.
 816  *
 817  * In modified UTF-7, printable US-ASCII characters except for "&"
 818  * represent themselves; that is, characters with octet values 0x20-0x25
 819  * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
 820  * octet sequence "&-".
 821  *
 822  * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
 823  * Unicode 16-bit octets) are represented in modified BASE64, with a
 824  * further modification from [UTF-7] that "," is used instead of "/".
 825  * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
 826  * character which can represent itself.
 827  *
 828  * "&" is used to shift to modified BASE64 and "-" to shift back to US-
 829  * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
 830  * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
 831  * ").
 832  *
 833  * For example, here is a mailbox name which mixes English, Japanese,
 834  * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
 835  */
 836
 837 /*
 838  * Tests for US-ASCII characters belonging to character classes
 839  * defined in UTF-7.
 840  *
 841  * Set D (directly encoded characters) consists of the following
 842  * characters: the upper and lower case letters A through Z
 843  * and a through z, the 10 digits 0-9, and the following nine special
 844  * characters (note that "+" and "=" are omitted):
 845  *     '(),-./:?
 846  *
 847  * Set O (optional direct characters) consists of the following
 848  * characters (note that "\" and "~" are omitted):
 849  *     !"#$%&*;<=>@[]^_`{|}
 850  *
 851  * According to the rules in RFC 2152, the byte values for the following
 852  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 853  * - all C0 control codes except for CR LF TAB
 854  * - BACKSLASH
 855  * - TILDE
 856  * - DEL
 857  * - all codes beyond US-ASCII, i.e. all >127
 858  */
 859
 /* uses '&' not '+' to start a base64 sequence */
 #define AMPERSAND 0x26
 #define COMMA 0x2c
 #define SLASH 0x2f

 /*
  * All macros below evaluate their argument more than once;
  * pass only expressions without side effects.
  */

 /* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
 #define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

 /*
  * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
  *
  * The argument is parenthesized in the comparison with AMPERSAND as well,
  * so that an expression argument such as (x & 0x7f) is compared as a whole
  * instead of being regrouped by operator precedence.
  */
 #define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

 /*
  * Modified base64 alphabet: ',' takes the place of '/' for the value 63.
  *
  * TO_BASE64_IMAP maps a 6-bit value to its output byte; values below 63 are
  * looked up in toBase64, 63 becomes ','.
  * FROM_BASE64_IMAP maps an input byte to its 6-bit value: ',' decodes to 63,
  * '/' is rejected with -1, and every other byte is looked up in fromBase64.
  * The decoder in this file treats -2 as the terminating minus sign and the
  * other negative results as illegal input, and it only applies
  * FROM_BASE64_IMAP to bytes that are not greater than 0x7e.
  */
 #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
 #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
 873
 874 /*
 875  * converter status values:
 876  *
 877  * toUnicodeStatus:
 878  *     24 inDirectMode (boolean)
 879  * 23..16 base64Counter (-1..7)
 880  * 15..0  bits (up to 14 bits incoming base64)
 881  *
 882  * fromUnicodeStatus:
 883  *     24 inDirectMode (boolean)
 884  * 23..16 base64Counter (0..2)
 885  *  7..0  bits (6 bits outgoing base64)
 886  *
 887  * ignore bits 31..25 (the encoder carries fromUnicodeStatus bits 31..28 over unchanged)
 888  */
 889
 /*
  * Converts IMAP mailbox-name (modified UTF-7) bytes to UTF-16 code units.
  *
  * Input is read from pArgs->source up to pArgs->sourceLimit, UChars are
  * written to pArgs->target up to pArgs->targetLimit and, if pArgs->offsets
  * is not NULL, one source offset is written per UChar. The three pointers
  * are written back to pArgs before returning.
  *
  * The decoder state (direct/Unicode mode, base64 counter, collected bits) is
  * loaded from cnv->toUnicodeStatus and stored back at the end, together
  * with the number of pending input bytes in cnv->toULength (the bytes
  * themselves are kept in cnv->toUBytes), so that a conversion can be
  * continued by a later call.
  *
  * *pErrorCode is set to
  *   U_BUFFER_OVERFLOW_ERROR  when input remains but the target is full;
  *   U_ILLEGAL_CHAR_FOUND     for a direct-mode byte outside 0x20..0x7e, for
  *                            a byte in a base64 run that is greater than
  *                            0x7e or whose FROM_BASE64_IMAP value is
  *                            negative and not -2, for base64 that decodes
  *                            to a UChar in 0x20..0x7e, and for a run that
  *                            is terminated with bits left over;
  *   U_TRUNCATED_CHAR_FOUND   when pArgs->flush is set, the input is used up,
  *                            no other error occurred, and the decoder is
  *                            still in Unicode mode with no pending bytes.
  */
 890 static void
 891 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 892                           UErrorCode *pErrorCode) {
 893     UConverter *cnv;
 894     const uint8_t *source, *sourceLimit;
 895     UChar *target;
 896     const UChar *targetLimit;
 897     int32_t *offsets;
 898
 899     uint8_t *bytes;
 900     uint8_t byteIndex;
 901
 902     int32_t length, targetCapacity;
 903
 904     /* UTF-7 state */
 905     uint16_t bits;
 906     int8_t base64Counter;
 907     UBool inDirectMode;
 908
 909     int8_t base64Value;
 910
 911     int32_t sourceIndex, nextSourceIndex;
 912
 913     UChar c;
 914     uint8_t b;
 915
 916     /* set up the local pointers */
 917     cnv=pArgs->converter;
 918
 919     source=(const uint8_t *)pArgs->source;
 920     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 921     target=pArgs->target;
 922     targetLimit=pArgs->targetLimit;
 923     offsets=pArgs->offsets;
 924     /* get the state machine state */
 925     {
              /*
               * The counter is stored as a (uint8_t) at the end of this
               * function; the cast back to int8_t is meant to recover the
               * value -1 ("immediately after the ampersand").
               */
 926         uint32_t status=cnv->toUnicodeStatus;
 927         inDirectMode=(UBool)((status>>24)&1);
 928         base64Counter=(int8_t)(status>>16);
 929         bits=(uint16_t)status;
 930     }
 931     bytes=cnv->toUBytes;
 932     byteIndex=cnv->toULength;
 933
 934     /* sourceIndex=-1 if the current character began in the previous buffer */
 935     sourceIndex=byteIndex==0 ? 0 : -1;
 936     nextSourceIndex=0;
 937
 938     if(inDirectMode) {
 939 directMode:
 940         /*
 941          * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
 942          * with their US-ASCII byte values.
 943          * An ampersand starts Unicode (or "escape") Mode.
 944          *
 945          * In Direct Mode, only the sourceIndex is used.
 946          */
 947         byteIndex=0;
              /*
               * Each direct byte produces exactly one UChar, so the loop count
               * is limited to the smaller of the remaining input and the
               * remaining target capacity; no per-byte target check is needed.
               */
 948         length=(int32_t)(sourceLimit-source);
 949         targetCapacity=(int32_t)(targetLimit-target);
 950         if(length>targetCapacity) {
 951             length=targetCapacity;
 952         }
 953         while(length>0) {
 954             b=*source++;
 955             if(!isLegalIMAP(b)) {
 956                 /* illegal */
 957                 bytes[0]=b;
 958                 byteIndex=1;
 959                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 960                 break;
 961             } else if(b!=AMPERSAND) {
 962                 /* write directly encoded character */
 963                 *target++=b;
 964                 if(offsets!=NULL) {
 965                     *offsets++=sourceIndex++;
 966                 }
 967             } else /* AMPERSAND */ {
 968                 /* switch to Unicode mode */
 969                 nextSourceIndex=++sourceIndex;
 970                 inDirectMode=FALSE;
 971                 byteIndex=0;
 972                 bits=0;
 973                 base64Counter=-1;
 974                 goto unicodeMode;
 975             }
 976             --length;
 977         }
 978         if(source<sourceLimit && target>=targetLimit) {
 979             /* target is full */
 980             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 981         }
 982     } else {
 983 unicodeMode:
 984         /*
 985          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
 986          * The base64 sequence ends with any character that is not in the base64 alphabet.
 987          * A terminating minus sign is consumed.
 988          * US-ASCII must not be base64-ed.
 989          *
 990          * In Unicode Mode, the sourceIndex has the index to the start of the current
 991          * base64 bytes, while nextSourceIndex is precisely parallel to source,
 992          * keeping the index to the following byte.
 993          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
 994          */
 995         while(source<sourceLimit) {
 996             if(target<targetLimit) {
 997                 bytes[byteIndex++]=b=*source++;
 998                 ++nextSourceIndex;
 999                 if(b>0x7e) {
1000                     /* illegal - test other illegal US-ASCII values by base64Value==-3 */
1001                     inDirectMode=TRUE;
1002                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1003                     break;
1004                 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1005                     /* collect base64 bytes into UChars */
                          /*
                           * base64Counter is the position within a group of
                           * eight base64 bytes (48 bits = three UChars); a
                           * UChar is completed at positions 2, 5 and 7.
                           */
1006                     switch(base64Counter) {
1007                     case -1: /* -1 is immediately after the & */
1008                     case 0:
1009                         bits=base64Value;
1010                         base64Counter=1;
1011                         break;
1012                     case 1:
1013                     case 3:
1014                     case 4:
1015                     case 6:
1016                         bits=(uint16_t)((bits<<6)|base64Value);
1017                         ++base64Counter;
1018                         break;
                          /* 12 collected bits + upper 4 bits of this value; the lower 2 bits start the next UChar */
1019                     case 2:
1020                         c=(UChar)((bits<<4)|(base64Value>>2));
1021                         if(isLegalIMAP(c)) {
1022                             /* illegal */
1023                             inDirectMode=TRUE;
1024                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1025                             goto endloop;
1026                         }
1027                         *target++=c;
1028                         if(offsets!=NULL) {
1029                             *offsets++=sourceIndex;
1030                             sourceIndex=nextSourceIndex-1;
1031                         }
1032                         bytes[0]=b; /* keep this byte in case an error occurs */
1033                         byteIndex=1;
1034                         bits=(uint16_t)(base64Value&3);
1035                         base64Counter=3;
1036                         break;
                          /* 14 collected bits + upper 2 bits of this value; the lower 4 bits start the next UChar */
1037                     case 5:
1038                         c=(UChar)((bits<<2)|(base64Value>>4));
1039                         if(isLegalIMAP(c)) {
1040                             /* illegal */
1041                             inDirectMode=TRUE;
1042                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1043                             goto endloop;
1044                         }
1045                         *target++=c;
1046                         if(offsets!=NULL) {
1047                             *offsets++=sourceIndex;
1048                             sourceIndex=nextSourceIndex-1;
1049                         }
1050                         bytes[0]=b; /* keep this byte in case an error occurs */
1051                         byteIndex=1;
1052                         bits=(uint16_t)(base64Value&15);
1053                         base64Counter=6;
1054                         break;
                          /* 10 collected bits + all 6 bits of this value; nothing is left over, the group is complete */
1055                     case 7:
1056                         c=(UChar)((bits<<6)|base64Value);
1057                         if(isLegalIMAP(c)) {
1058                             /* illegal */
1059                             inDirectMode=TRUE;
1060                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1061                             goto endloop;
1062                         }
1063                         *target++=c;
1064                         if(offsets!=NULL) {
1065                             *offsets++=sourceIndex;
1066                             sourceIndex=nextSourceIndex;
1067                         }
1068                         byteIndex=0;
1069                         bits=0;
1070                         base64Counter=0;
1071                         break;
1072                     default:
1073                         /* will never occur */
1074                         break;
1075                     }
1076                 } else if(base64Value==-2) {
1077                     /* minus sign terminates the base64 sequence */
1078                     inDirectMode=TRUE;
1079                     if(base64Counter==-1) {
1080                         /* &- i.e. a minus immediately following an ampersand */
1081                         *target++=AMPERSAND;
1082                         if(offsets!=NULL) {
1083                             *offsets++=sourceIndex-1;
1084                         }
1085                     } else {
1086                         /* absorb the minus and leave the Unicode Mode */
1087                         if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1088                             /* bits are illegally left over, a UChar is incomplete */
1089                             /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1090                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1091                             break;
1092                         }
1093                     }
1094                     sourceIndex=nextSourceIndex;
1095                     goto directMode;
1096                 } else {
1097                     if(base64Counter==-1) {
1098                         /* illegal: & immediately followed by something other than base64 or minus sign */
1099                         /* include the ampersand in the reported sequence */
1100                         --sourceIndex;
1101                         bytes[0]=AMPERSAND;
1102                         bytes[1]=b;
1103                         byteIndex=2;
1104                     }
1105                     /* base64Value==-1 for characters that are illegal only in Unicode mode */
1106                     /* base64Value==-3 for illegal characters */
1107                     /* illegal */
1108                     inDirectMode=TRUE;
1109                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1110                     break;
1111                 }
1112             } else {
1113                 /* target is full */
1114                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1115                 break;
1116             }
1117         }
1118     }
1119 endloop:
1120
1121     /*
1122      * the end of the input stream and detection of truncated input
1123      * are handled by the framework, but here we must check if we are in Unicode
1124      * mode and byteIndex==0 because we must end in direct mode
1125      *
1126      * conditions:
1127      *   successful
1128      *   in Unicode mode and byteIndex==0
1129      *   end of input and no truncated input
1130      */
1131     if( U_SUCCESS(*pErrorCode) &&
1132         !inDirectMode && byteIndex==0 &&
1133         pArgs->flush && source>=sourceLimit
1134     ) {
1135         if(base64Counter==-1) {
1136             /* & at the very end of the input */
1137             /* make the ampersand the reported sequence */
1138             bytes[0]=AMPERSAND;
1139             byteIndex=1;
1140         }
1141         /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1142
1143         inDirectMode=TRUE; /* avoid looping */
1144         *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1145     }
1146
1147     /* set the converter state back into UConverter */
          /* layout: bit 24 mode flag, bits 23..16 counter as an unsigned byte, bits 15..0 collected bits */
1148     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1149     cnv->toULength=byteIndex;
1150
1151     /* write back the updated pointers */
1152     pArgs->source=(const char *)source;
1153     pArgs->target=target;
1154     pArgs->offsets=offsets;
1155     return;
1156 }
1157
/*
 * Converts UTF-16 code units to IMAP mailbox-name (modified UTF-7) bytes.
 *
 * Input is read from pArgs->source up to pArgs->sourceLimit, bytes are
 * written to pArgs->target up to pArgs->targetLimit and, if pArgs->offsets
 * is not NULL, one source offset is written per byte that reaches the target.
 * The three pointers are written back to pArgs before returning.
 *
 * Code units 0x20..0x7e other than '&' are copied as single bytes, '&' is
 * written as AMPERSAND followed by MINUS, and every other code unit is
 * written as modified base64 between an AMPERSAND and a MINUS.
 *
 * When the target fills up in the middle of the output for one code unit,
 * the remaining bytes for that code unit are stored in cnv->charErrorBuffer
 * and *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR; the same error is set
 * when input remains and no target space is left.
 *
 * If pArgs->flush is set and all input was consumed, an open base64 run is
 * closed (pending bits, then MINUS) and the state is reset to direct mode;
 * otherwise the state is saved in cnv->fromUnicodeStatus for the next call.
 * In both cases bits 31..28 of cnv->fromUnicodeStatus are preserved.
 */
1158 static void
1159 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1160                             UErrorCode *pErrorCode) {
1161     UConverter *cnv;
1162     const UChar *source, *sourceLimit;
1163     uint8_t *target, *targetLimit;
1164     int32_t *offsets;
1165
1166     int32_t length, targetCapacity, sourceIndex;
1167     UChar c;
1168     uint8_t b;
1169
1170     /* UTF-7 state */
1171     uint8_t bits;
1172     int8_t base64Counter;
1173     UBool inDirectMode;
1174
1175     /* set up the local pointers */
1176     cnv=pArgs->converter;
1177
1178     /* set up the local pointers */
1179     source=pArgs->source;
1180     sourceLimit=pArgs->sourceLimit;
1181     target=(uint8_t *)pArgs->target;
1182     targetLimit=(uint8_t *)pArgs->targetLimit;
1183     offsets=pArgs->offsets;
1184
1185     /* get the state machine state */
1186     {
1187         uint32_t status=cnv->fromUnicodeStatus;
1188         inDirectMode=(UBool)((status>>24)&1);
1189         base64Counter=(int8_t)(status>>16);
1190         bits=(uint8_t)status;
1191     }
1192
1193     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1194     sourceIndex=0;
1195
1196     if(inDirectMode) {
1197 directMode:
         /*
          * The loop count is the smaller of the remaining input and the
          * remaining target capacity, so the first byte written for each code
          * unit always fits; only additional bytes need their own check.
          */
1198         length=(int32_t)(sourceLimit-source);
1199         targetCapacity=(int32_t)(targetLimit-target);
1200         if(length>targetCapacity) {
1201             length=targetCapacity;
1202         }
1203         while(length>0) {
1204             c=*source++;
1205             /* encode 0x20..0x7e except '&' directly */
1206             if(inSetDIMAP(c)) {
1207                 /* encode directly */
1208                 *target++=(uint8_t)c;
1209                 if(offsets!=NULL) {
1210                     *offsets++=sourceIndex++;
1211                 }
1212             } else if(c==AMPERSAND) {
1213                 /* output &- for & */
1214                 *target++=AMPERSAND;
1215                 if(target<targetLimit) {
1216                     *target++=MINUS;
1217                     if(offsets!=NULL) {
1218                         *offsets++=sourceIndex;
1219                         *offsets++=sourceIndex++;
1220                     }
1221                     /* realign length and targetCapacity */
1222                     goto directMode;
1223                 } else {
1224                     if(offsets!=NULL) {
1225                         *offsets++=sourceIndex++;
1226                     }
1227                     cnv->charErrorBuffer[0]=MINUS;
1228                     cnv->charErrorBufferLength=1;
1229                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1230                     break;
1231                 }
1232             } else {
1233                 /* un-read this character and switch to Unicode Mode */
1234                 --source;
1235                 *target++=AMPERSAND;
1236                 if(offsets!=NULL) {
1237                     *offsets++=sourceIndex;
1238                 }
1239                 inDirectMode=FALSE;
1240                 base64Counter=0;
1241                 goto unicodeMode;
1242             }
1243             --length;
1244         }
1245         if(source<sourceLimit && target>=targetLimit) {
1246             /* target is full */
1247             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1248         }
1249     } else {
1250 unicodeMode:
1251         while(source<sourceLimit) {
                 /* this check guarantees room for the first byte written in this iteration */
1252             if(target<targetLimit) {
1253                 c=*source++;
1254                 if(isLegalIMAP(c)) {
1255                     /* encode directly */
1256                     inDirectMode=TRUE;
1257
1258                     /* trick: back out this character to make this easier */
1259                     --source;
1260
1261                     /* terminate the base64 sequence */
1262                     if(base64Counter!=0) {
1263                         /* write remaining bits for the previous character */
1264                         *target++=TO_BASE64_IMAP(bits);
1265                         if(offsets!=NULL) {
1266                             *offsets++=sourceIndex-1;
1267                         }
1268                     }
1269                     /* need to terminate with a minus */
1270                     if(target<targetLimit) {
1271                         *target++=MINUS;
1272                         if(offsets!=NULL) {
1273                             *offsets++=sourceIndex-1;
1274                         }
1275                     } else {
1276                         cnv->charErrorBuffer[0]=MINUS;
1277                         cnv->charErrorBufferLength=1;
1278                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1279                         break;
1280                     }
1281                     goto directMode;
1282                 } else {
1283                     /*
1284                      * base64 this character:
1285                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1286                      * and the bits of this character, each implicitly in UTF-16BE.
1287                      *
1288                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1289                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
1290                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1291                      */
1292                     switch(base64Counter) {
                         /* no bits pending: emit bits 15..10 and 9..4, keep bits 3..0 */
1293                     case 0:
1294                         b=(uint8_t)(c>>10);
1295                         *target++=TO_BASE64_IMAP(b);
1296                         if(target<targetLimit) {
1297                             b=(uint8_t)((c>>4)&0x3f);
1298                             *target++=TO_BASE64_IMAP(b);
1299                             if(offsets!=NULL) {
1300                                 *offsets++=sourceIndex;
1301                                 *offsets++=sourceIndex++;
1302                             }
1303                         } else {
1304                             if(offsets!=NULL) {
1305                                 *offsets++=sourceIndex++;
1306                             }
1307                             b=(uint8_t)((c>>4)&0x3f);
1308                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1309                             cnv->charErrorBufferLength=1;
1310                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1311                         }
1312                         bits=(uint8_t)((c&15)<<2);
1313                         base64Counter=1;
1314                         break;
                         /* 4 bits pending: emit pending+bits 15..14, then 13..8 and 7..2, keep bits 1..0 */
1315                     case 1:
1316                         b=(uint8_t)(bits|(c>>14));
1317                         *target++=TO_BASE64_IMAP(b);
1318                         if(target<targetLimit) {
1319                             b=(uint8_t)((c>>8)&0x3f);
1320                             *target++=TO_BASE64_IMAP(b);
1321                             if(target<targetLimit) {
1322                                 b=(uint8_t)((c>>2)&0x3f);
1323                                 *target++=TO_BASE64_IMAP(b);
1324                                 if(offsets!=NULL) {
1325                                     *offsets++=sourceIndex;
1326                                     *offsets++=sourceIndex;
1327                                     *offsets++=sourceIndex++;
1328                                 }
1329                             } else {
1330                                 if(offsets!=NULL) {
1331                                     *offsets++=sourceIndex;
1332                                     *offsets++=sourceIndex++;
1333                                 }
1334                                 b=(uint8_t)((c>>2)&0x3f);
1335                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1336                                 cnv->charErrorBufferLength=1;
1337                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1338                             }
1339                         } else {
1340                             if(offsets!=NULL) {
1341                                 *offsets++=sourceIndex++;
1342                             }
1343                             b=(uint8_t)((c>>8)&0x3f);
1344                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1345                             b=(uint8_t)((c>>2)&0x3f);
1346                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1347                             cnv->charErrorBufferLength=2;
1348                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1349                         }
1350                         bits=(uint8_t)((c&3)<<4);
1351                         base64Counter=2;
1352                         break;
                         /* 2 bits pending: emit pending+bits 15..12, then 11..6 and 5..0, nothing is kept */
1353                     case 2:
1354                         b=(uint8_t)(bits|(c>>12));
1355                         *target++=TO_BASE64_IMAP(b);
1356                         if(target<targetLimit) {
1357                             b=(uint8_t)((c>>6)&0x3f);
1358                             *target++=TO_BASE64_IMAP(b);
1359                             if(target<targetLimit) {
1360                                 b=(uint8_t)(c&0x3f);
1361                                 *target++=TO_BASE64_IMAP(b);
1362                                 if(offsets!=NULL) {
1363                                     *offsets++=sourceIndex;
1364                                     *offsets++=sourceIndex;
1365                                     *offsets++=sourceIndex++;
1366                                 }
1367                             } else {
1368                                 if(offsets!=NULL) {
1369                                     *offsets++=sourceIndex;
1370                                     *offsets++=sourceIndex++;
1371                                 }
1372                                 b=(uint8_t)(c&0x3f);
1373                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1374                                 cnv->charErrorBufferLength=1;
1375                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1376                             }
1377                         } else {
1378                             if(offsets!=NULL) {
1379                                 *offsets++=sourceIndex++;
1380                             }
1381                             b=(uint8_t)((c>>6)&0x3f);
1382                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1383                             b=(uint8_t)(c&0x3f);
1384                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1385                             cnv->charErrorBufferLength=2;
1386                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1387                         }
1388                         bits=0;
1389                         base64Counter=0;
1390                         break;
1391                     default:
1392                         /* will never occur */
1393                         break;
1394                     }
1395                 }
1396             } else {
1397                 /* target is full */
1398                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1399                 break;
1400             }
1401         }
1402     }
1403
1404     if(pArgs->flush && source>=sourceLimit) {
1405         /* flush remaining bits to the target */
1406         if(!inDirectMode) {
             /* bytes that do not fit are appended to charErrorBuffer after whatever it already holds */
1407             if(base64Counter!=0) {
1408                 if(target<targetLimit) {
1409                     *target++=TO_BASE64_IMAP(bits);
1410                     if(offsets!=NULL) {
1411                         *offsets++=sourceIndex-1;
1412                     }
1413                 } else {
1414                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1415                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1416                 }
1417             }
1418             /* need to terminate with a minus */
1419             if(target<targetLimit) {
1420                 *target++=MINUS;
1421                 if(offsets!=NULL) {
1422                     *offsets++=sourceIndex-1;
1423                 }
1424             } else {
1425                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1426                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1427             }
1428         }
1429         /* reset the state for the next conversion */
1430         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1431     } else {
1432         /* set the converter state back into UConverter */
1433         cnv->fromUnicodeStatus=
1434             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
1435             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1436     }
1437
1438     /* write back the updated pointers */
1439     pArgs->source=source;
1440     pArgs->target=(char *)target;
1441     pArgs->offsets=offsets;
1442     return;
1443 }
1444
/*
 * Implementation table for the IMAP mailbox-name converter.
 *
 * The initializers are positional: their meaning is fixed by the member
 * order of UConverterImpl, which is declared outside this section.
 * The open and reset entries reuse the UTF-7 functions; the conversion
 * entries are the IMAP-specific functions defined above.
 * This list has two fewer initializers than _UTF7Impl; members without an
 * initializer are zero-initialized by the language rules for static objects.
 */
1445 static const UConverterImpl _IMAPImpl={
1446     UCNV_IMAP_MAILBOX,
1447
1448     NULL,
1449     NULL,
1450
1451     _UTF7Open,
1452     NULL,
1453     _UTF7Reset,
1454
1455     _IMAPToUnicodeWithOffsets,
1456     _IMAPToUnicodeWithOffsets,
1457     _IMAPFromUnicodeWithOffsets,
1458     _IMAPFromUnicodeWithOffsets,
1459     NULL,
1460
1461     NULL,
1462     NULL,
1463     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1464     NULL,
1465     ucnv_getCompleteUnicodeSet
1466 };
1467
/*
 * Static description of the IMAP mailbox-name converter.
 *
 * The initializers are positional; the field meanings come from
 * UConverterStaticData, which is declared outside this section. Visible
 * here: the structure's own size, the name "IMAP-mailbox-name", a
 * placeholder 0 for the CCSID, the UCNV_IBM / UCNV_IMAP_MAILBOX constants,
 * and a substitution byte 0x3f that the existing comment marks as unused.
 * NOTE(review): "1, 4" looks like the minimum/maximum number of bytes per
 * character - confirm against the UConverterStaticData declaration.
 */
1468 static const UConverterStaticData _IMAPStaticData={
1469     sizeof(UConverterStaticData),
1470     "IMAP-mailbox-name",
1471     0, /* TODO CCSID for IMAP-mailbox-name */
1472     UCNV_IBM, UCNV_IMAP_MAILBOX,
1473     1, 4,
1474     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1475     FALSE, FALSE,
1476     0,
1477     0,
1478     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1479 };
1480
/*
 * Shared data object for the IMAP mailbox-name converter: combines
 * _IMAPStaticData and _IMAPImpl through UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER.
 * Unlike the two tables it refers to, it is not "static", i.e. it has
 * external linkage.
 */
1481 const UConverterSharedData _IMAPData=
1482         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1483
1484 #endif