1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 * tab size: 8 (not used)
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
16 * UTF-7 converter implementation. Used to be in ucnv_utf.c.
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
24 #include "unicode/ucnv.h"
29 /* UTF-7 -------------------------------------------------------------------- */
32 * UTF-7 is a stateful encoding of Unicode.
33 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
34 * It was intended for use in Internet email systems, using in its bytewise
35 * encoding only a subset of 7-bit US-ASCII.
36 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
 * For converting Unicode to UTF-7, the RFC allows some US-ASCII characters
 * to be encoded either directly or in base64. In particular, the characters
 * in set O as defined in the RFC (see below) may be encoded directly but are
 * not allowed in, e.g., email headers.
43 * By default, the ICU UTF-7 converter encodes set O directly.
44 * By choosing the option "version=1", set O will be escaped instead.
46 * utf7Converter=ucnv_open("UTF-7,version=1");
48 * For details about email headers see RFC 2047.
52 * Tests for US-ASCII characters belonging to character classes
55 * Set D (directly encoded characters) consists of the following
56 * characters: the upper and lower case letters A through Z
57 * and a through z, the 10 digits 0-9, and the following nine special
58 * characters (note that "+" and "=" are omitted):
61 * Set O (optional direct characters) consists of the following
62 * characters (note that "\" and "~" are omitted):
63 * !"#$%&*;<=>@[]^_`{|}
65 * According to the rules in RFC 2152, the byte values for the following
66 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
67 * - all C0 control codes except for CR LF TAB
71 * - all codes beyond US-ASCII, i.e. all >127
74 ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
75 (uint8_t)((c)-48)<10 || /* digits */ \
76 (uint8_t)((c)-39)<3 || /* '() */ \
77 (uint8_t)((c)-44)<4 || /* ,-./ */ \
78 (c)==58 || (c)==63 /* :? */ \
82 ((uint8_t)((c)-33)<6 || /* !"#$%& */ \
83 (uint8_t)((c)-59)<4 || /* ;<=> */ \
84 (uint8_t)((c)-93)<4 || /* ]^_` */ \
85 (uint8_t)((c)-123)<3 || /* {|} */ \
86 (c)==42 || (c)==64 || (c)==91 /* *@[ */ \
/*
 * Character-class tests on numeric US-ASCII code values.
 * Each macro may evaluate its argument more than once, so pass only
 * side-effect-free expressions.
 */

/* CR (0x0d), LF (0x0a) or TAB (0x09) */
#define isCRLFTAB(c) ((c)==0x0d || (c)==0x0a || (c)==0x09)

/* SP (0x20), CR (0x0d), LF (0x0a) or TAB (0x09) */
#define isCRLFSPTAB(c) ((c)==0x20 || (c)==0x0d || (c)==0x0a || (c)==0x09)

/*
 * Legal UTF-7 byte values: 0x20..0x7d (space up to, but not including, tilde)
 * with the exception of backslash, plus CR LF TAB.
 * The uint8_t cast folds the two-sided range check into a single comparison.
 */
#define isLegalUTF7(c) (((uint8_t)((c)-0x20)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
100 /* encode directly sets D and O and CR LF SP TAB */
101 static const UBool encodeDirectlyMaximum[128]={
102 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
107 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
112 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
113 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
116 /* encode directly set D and CR LF SP TAB but not set O */
117 static const UBool encodeDirectlyRestricted[128]={
118 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
123 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
125 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
128 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
135 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
136 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
138 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
139 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
141 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
148 /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
149 -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
150 -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
152 /* general punctuation with + and / and a special value (-2) for - */
153 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
155 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
158 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
159 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
162 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
163 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
167 * converter status values:
170 * 24 inDirectMode (boolean)
171 * 23..16 base64Counter (-1..7)
172 * 15..0 bits (up to 14 bits incoming base64)
175 * 31..28 version (0: set O direct 1: set O escaped)
176 * 24 inDirectMode (boolean)
177 * 23..16 base64Counter (0..2)
178 * 7..0 bits (6 bits outgoing base64)
183 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
184 if(choice<=UCNV_RESET_TO_UNICODE) {
185 /* reset toUnicode */
186 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
189 if(choice!=UCNV_RESET_TO_UNICODE) {
190 /* reset fromUnicode */
191 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
196 _UTF7Open(UConverter *cnv,
197 UConverterLoadArgs *pArgs,
198 UErrorCode *pErrorCode) {
199 if(UCNV_GET_VERSION(cnv)<=1) {
200 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
201 cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
202 _UTF7Reset(cnv, UCNV_RESET_BOTH);
204 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
209 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
210 UErrorCode *pErrorCode) {
212 const uint8_t *source, *sourceLimit;
214 const UChar *targetLimit;
220 int32_t length, targetCapacity;
224 int8_t base64Counter;
229 int32_t sourceIndex, nextSourceIndex;
232 /* set up the local pointers */
233 cnv=pArgs->converter;
235 source=(const uint8_t *)pArgs->source;
236 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
237 target=pArgs->target;
238 targetLimit=pArgs->targetLimit;
239 offsets=pArgs->offsets;
240 /* get the state machine state */
242 uint32_t status=cnv->toUnicodeStatus;
243 inDirectMode=(UBool)((status>>24)&1);
244 base64Counter=(int8_t)(status>>16);
245 bits=(uint16_t)status;
248 byteIndex=cnv->toULength;
250 /* sourceIndex=-1 if the current character began in the previous buffer */
251 sourceIndex=byteIndex==0 ? 0 : -1;
257 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
258 * with their US-ASCII byte values.
259 * Backslash and Tilde and most control characters are not allowed in UTF-7.
260 * A plus sign starts Unicode (or "escape") Mode.
262 * In Direct Mode, only the sourceIndex is used.
265 length=(int32_t)(sourceLimit-source);
266 targetCapacity=(int32_t)(targetLimit-target);
267 if(length>targetCapacity) {
268 length=targetCapacity;
272 if(!isLegalUTF7(b)) {
276 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
279 /* write directly encoded character */
282 *offsets++=sourceIndex++;
285 /* switch to Unicode mode */
286 nextSourceIndex=++sourceIndex;
295 if(source<sourceLimit && target>=targetLimit) {
297 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
302 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
303 * The base64 sequence ends with any character that is not in the base64 alphabet.
304 * A terminating minus sign is consumed.
306 * In Unicode Mode, the sourceIndex has the index to the start of the current
307 * base64 bytes, while nextSourceIndex is precisely parallel to source,
308 * keeping the index to the following byte.
309 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
311 while(source<sourceLimit) {
312 if(target<targetLimit) {
313 bytes[byteIndex++]=b=*source++;
315 base64Value = -3; /* initialize as illegal */
316 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
318 * base64Value==-1 for any legal character except base64 and minus sign, or
319 * base64Value==-3 for illegal characters:
320 * 1. In either case, leave Unicode mode.
321 * 2.1. If we ended with an incomplete UChar or none after the +, then
322 * generate an error for the preceding erroneous sequence and deal with
323 * the current (possibly illegal) character next time through.
324 * 2.2. Else the current char comes after a complete UChar, which was already
325 * pushed to the output buf, so:
326 * 2.2.1. If the current char is legal, just save it for processing next time.
327 * It may be for example, a plus which we need to deal with in direct mode.
328 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
331 if(base64Counter==-1) {
332 /* illegal: + immediately followed by something other than base64 or minus sign */
333 /* include the plus sign in the reported sequence, but not the subsequent char */
337 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
340 /* bits are illegally left over, a UChar is incomplete */
341 /* don't include current char (legal or illegal) in error seq */
344 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
347 /* previous UChar was complete */
348 if(base64Value==-3) {
349 /* current character is illegal, deal with it here */
350 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
353 /* un-read the current character in case it is a plus sign */
355 sourceIndex=nextSourceIndex-1;
359 } else if(base64Value>=0) {
360 /* collect base64 bytes into UChars */
361 switch(base64Counter) {
362 case -1: /* -1 is immediately after the + */
371 bits=(uint16_t)((bits<<6)|base64Value);
375 *target++=(UChar)((bits<<4)|(base64Value>>2));
377 *offsets++=sourceIndex;
378 sourceIndex=nextSourceIndex-1;
380 bytes[0]=b; /* keep this byte in case an error occurs */
382 bits=(uint16_t)(base64Value&3);
386 *target++=(UChar)((bits<<2)|(base64Value>>4));
388 *offsets++=sourceIndex;
389 sourceIndex=nextSourceIndex-1;
391 bytes[0]=b; /* keep this byte in case an error occurs */
393 bits=(uint16_t)(base64Value&15);
397 *target++=(UChar)((bits<<6)|base64Value);
399 *offsets++=sourceIndex;
400 sourceIndex=nextSourceIndex;
407 /* will never occur */
410 } else /*base64Value==-2*/ {
411 /* minus sign terminates the base64 sequence */
413 if(base64Counter==-1) {
414 /* +- i.e. a minus immediately following a plus */
417 *offsets++=sourceIndex-1;
420 /* absorb the minus and leave the Unicode Mode */
422 /* bits are illegally left over, a UChar is incomplete */
423 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
427 sourceIndex=nextSourceIndex;
432 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
438 if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
440 * if we are in Unicode mode, then the byteIndex might not be 0,
441 * but that is ok if bits==0
442 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
443 * (not true for IMAP-mailbox-name where we must end in direct mode)
448 /* set the converter state back into UConverter */
449 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
450 cnv->toULength=byteIndex;
452 /* write back the updated pointers */
453 pArgs->source=(const char *)source;
454 pArgs->target=target;
455 pArgs->offsets=offsets;
460 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
461 UErrorCode *pErrorCode) {
463 const UChar *source, *sourceLimit;
464 uint8_t *target, *targetLimit;
467 int32_t length, targetCapacity, sourceIndex;
471 const UBool *encodeDirectly;
473 int8_t base64Counter;
476 /* set up the local pointers */
477 cnv=pArgs->converter;
479 /* set up the local pointers */
480 source=pArgs->source;
481 sourceLimit=pArgs->sourceLimit;
482 target=(uint8_t *)pArgs->target;
483 targetLimit=(uint8_t *)pArgs->targetLimit;
484 offsets=pArgs->offsets;
486 /* get the state machine state */
488 uint32_t status=cnv->fromUnicodeStatus;
489 encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
490 inDirectMode=(UBool)((status>>24)&1);
491 base64Counter=(int8_t)(status>>16);
492 bits=(uint8_t)status;
493 U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
496 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
501 length=(int32_t)(sourceLimit-source);
502 targetCapacity=(int32_t)(targetLimit-target);
503 if(length>targetCapacity) {
504 length=targetCapacity;
508 /* currently always encode CR LF SP TAB directly */
509 if(c<=127 && encodeDirectly[c]) {
510 /* encode directly */
511 *target++=(uint8_t)c;
513 *offsets++=sourceIndex++;
516 /* output +- for + */
518 if(target<targetLimit) {
521 *offsets++=sourceIndex;
522 *offsets++=sourceIndex++;
524 /* realign length and targetCapacity */
528 *offsets++=sourceIndex++;
530 cnv->charErrorBuffer[0]=MINUS;
531 cnv->charErrorBufferLength=1;
532 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
536 /* un-read this character and switch to Unicode Mode */
540 *offsets++=sourceIndex;
548 if(source<sourceLimit && target>=targetLimit) {
550 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
554 while(source<sourceLimit) {
555 if(target<targetLimit) {
557 if(c<=127 && encodeDirectly[c]) {
558 /* encode directly */
561 /* trick: back out this character to make this easier */
564 /* terminate the base64 sequence */
565 if(base64Counter!=0) {
566 /* write remaining bits for the previous character */
567 *target++=toBase64[bits];
569 *offsets++=sourceIndex-1;
572 if(fromBase64[c]!=-1) {
573 /* need to terminate with a minus */
574 if(target<targetLimit) {
577 *offsets++=sourceIndex-1;
580 cnv->charErrorBuffer[0]=MINUS;
581 cnv->charErrorBufferLength=1;
582 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
589 * base64 this character:
590 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
591 * and the bits of this character, each implicitly in UTF-16BE.
593 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
594 * character to the next. The actual 2 or 4 bits are shifted to the left edge
595 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
597 switch(base64Counter) {
599 *target++=toBase64[c>>10];
600 if(target<targetLimit) {
601 *target++=toBase64[(c>>4)&0x3f];
603 *offsets++=sourceIndex;
604 *offsets++=sourceIndex++;
608 *offsets++=sourceIndex++;
610 cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
611 cnv->charErrorBufferLength=1;
612 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
614 bits=(uint8_t)((c&15)<<2);
618 *target++=toBase64[bits|(c>>14)];
619 if(target<targetLimit) {
620 *target++=toBase64[(c>>8)&0x3f];
621 if(target<targetLimit) {
622 *target++=toBase64[(c>>2)&0x3f];
624 *offsets++=sourceIndex;
625 *offsets++=sourceIndex;
626 *offsets++=sourceIndex++;
630 *offsets++=sourceIndex;
631 *offsets++=sourceIndex++;
633 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
634 cnv->charErrorBufferLength=1;
635 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
639 *offsets++=sourceIndex++;
641 cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
642 cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
643 cnv->charErrorBufferLength=2;
644 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
646 bits=(uint8_t)((c&3)<<4);
650 *target++=toBase64[bits|(c>>12)];
651 if(target<targetLimit) {
652 *target++=toBase64[(c>>6)&0x3f];
653 if(target<targetLimit) {
654 *target++=toBase64[c&0x3f];
656 *offsets++=sourceIndex;
657 *offsets++=sourceIndex;
658 *offsets++=sourceIndex++;
662 *offsets++=sourceIndex;
663 *offsets++=sourceIndex++;
665 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
666 cnv->charErrorBufferLength=1;
667 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
671 *offsets++=sourceIndex++;
673 cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
674 cnv->charErrorBuffer[1]=toBase64[c&0x3f];
675 cnv->charErrorBufferLength=2;
676 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
682 /* will never occur */
688 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
694 if(pArgs->flush && source>=sourceLimit) {
695 /* flush remaining bits to the target */
697 if (base64Counter!=0) {
698 if(target<targetLimit) {
699 *target++=toBase64[bits];
701 *offsets++=sourceIndex-1;
704 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
705 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
708 /* Add final MINUS to terminate unicodeMode */
709 if(target<targetLimit) {
712 *offsets++=sourceIndex-1;
715 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
716 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
719 /* reset the state for the next conversion */
720 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
722 /* set the converter state back into UConverter */
723 cnv->fromUnicodeStatus=
724 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
725 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
728 /* write back the updated pointers */
729 pArgs->source=source;
730 pArgs->target=(char *)target;
731 pArgs->offsets=offsets;
736 _UTF7GetName(const UConverter *cnv) {
737 switch(cnv->fromUnicodeStatus>>28) {
739 return "UTF-7,version=1";
745 static const UConverterImpl _UTF7Impl={
755 _UTF7ToUnicodeWithOffsets,
756 _UTF7ToUnicodeWithOffsets,
757 _UTF7FromUnicodeWithOffsets,
758 _UTF7FromUnicodeWithOffsets,
763 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
765 ucnv_getCompleteUnicodeSet,
771 static const UConverterStaticData _UTF7StaticData={
772 sizeof(UConverterStaticData),
774 0, /* TODO CCSID for UTF-7 */
777 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
781 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
784 const UConverterSharedData _UTF7Data=
785 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
787 /* IMAP mailbox name encoding ----------------------------------------------- */
790 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
791 * http://www.ietf.org/rfc/rfc2060.txt
793 * 5.1.3. Mailbox International Naming Convention
795 * By convention, international mailbox names are specified using a
796 * modified version of the UTF-7 encoding described in [UTF-7]. The
797 * purpose of these modifications is to correct the following problems
800 * 1) UTF-7 uses the "+" character for shifting; this conflicts with
801 * the common use of "+" in mailbox names, in particular USENET
804 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
805 * conflicts with the use of "/" as a popular hierarchy delimiter.
807 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
808 * the use of "\" as a popular hierarchy delimiter.
810 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
811 * the use of "~" in some servers as a home directory indicator.
813 * 5) UTF-7 permits multiple alternate forms to represent the same
 * string; in particular, printable US-ASCII characters can be
815 * represented in encoded form.
817 * In modified UTF-7, printable US-ASCII characters except for "&"
818 * represent themselves; that is, characters with octet values 0x20-0x25
819 * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
820 * octet sequence "&-".
822 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
823 * Unicode 16-bit octets) are represented in modified BASE64, with a
824 * further modification from [UTF-7] that "," is used instead of "/".
825 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
826 * character which can represent itself.
828 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
829 * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
830 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
833 * For example, here is a mailbox name which mixes English, Japanese,
834 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
838 * Tests for US-ASCII characters belonging to character classes
841 * Set D (directly encoded characters) consists of the following
842 * characters: the upper and lower case letters A through Z
843 * and a through z, the 10 digits 0-9, and the following nine special
844 * characters (note that "+" and "=" are omitted):
847 * Set O (optional direct characters) consists of the following
848 * characters (note that "\" and "~" are omitted):
849 * !"#$%&*;<=>@[]^_`{|}
851 * According to the rules in RFC 2152, the byte values for the following
852 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
853 * - all C0 control codes except for CR LF TAB
857 * - all codes beyond US-ASCII, i.e. all >127
860 /* uses '&' not '+' to start a base64 sequence */
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26

/*
 * Legal byte values: the US-ASCII range 0x20..0x7e.
 * Evaluates its argument twice; pass only side-effect-free expressions.
 */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * Direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26.
 * The parameter is parenthesized in every use so that an expression argument
 * (e.g. "x & 0x7f" or a conditional) is compared as a whole against AMPERSAND
 * instead of being re-associated by operator precedence.
 * Evaluates its argument more than once; pass only side-effect-free expressions.
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)
/*
 * Modified base64 for IMAP mailbox names.
 * Encoding: values below 63 use the regular toBase64[] table, anything else
 * (i.e. 63) is written as a comma.
 */
#define TO_BASE64_IMAP(n) (!((n)<63) ? COMMA : toBase64[n])

/*
 * Decoding: a comma yields 63, a slash is reported as -1, and every other
 * byte is looked up in the regular fromBase64[] table.
 */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : ((c)==SLASH ? -1 : fromBase64[c]))
875 * converter status values:
878 * 24 inDirectMode (boolean)
879 * 23..16 base64Counter (-1..7)
880 * 15..0 bits (up to 14 bits incoming base64)
883 * 24 inDirectMode (boolean)
884 * 23..16 base64Counter (0..2)
885 * 7..0 bits (6 bits outgoing base64)
891 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
892 UErrorCode *pErrorCode) {
894 const uint8_t *source, *sourceLimit;
896 const UChar *targetLimit;
902 int32_t length, targetCapacity;
906 int8_t base64Counter;
911 int32_t sourceIndex, nextSourceIndex;
916 /* set up the local pointers */
917 cnv=pArgs->converter;
919 source=(const uint8_t *)pArgs->source;
920 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
921 target=pArgs->target;
922 targetLimit=pArgs->targetLimit;
923 offsets=pArgs->offsets;
924 /* get the state machine state */
926 uint32_t status=cnv->toUnicodeStatus;
927 inDirectMode=(UBool)((status>>24)&1);
928 base64Counter=(int8_t)(status>>16);
929 bits=(uint16_t)status;
932 byteIndex=cnv->toULength;
934 /* sourceIndex=-1 if the current character began in the previous buffer */
935 sourceIndex=byteIndex==0 ? 0 : -1;
941 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
942 * with their US-ASCII byte values.
943 * An ampersand starts Unicode (or "escape") Mode.
945 * In Direct Mode, only the sourceIndex is used.
948 length=(int32_t)(sourceLimit-source);
949 targetCapacity=(int32_t)(targetLimit-target);
950 if(length>targetCapacity) {
951 length=targetCapacity;
955 if(!isLegalIMAP(b)) {
959 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
961 } else if(b!=AMPERSAND) {
962 /* write directly encoded character */
965 *offsets++=sourceIndex++;
967 } else /* AMPERSAND */ {
968 /* switch to Unicode mode */
969 nextSourceIndex=++sourceIndex;
978 if(source<sourceLimit && target>=targetLimit) {
980 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
985 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
986 * The base64 sequence ends with any character that is not in the base64 alphabet.
987 * A terminating minus sign is consumed.
988 * US-ASCII must not be base64-ed.
990 * In Unicode Mode, the sourceIndex has the index to the start of the current
991 * base64 bytes, while nextSourceIndex is precisely parallel to source,
992 * keeping the index to the following byte.
993 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
995 while(source<sourceLimit) {
996 if(target<targetLimit) {
997 bytes[byteIndex++]=b=*source++;
1000 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
1002 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1004 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1005 /* collect base64 bytes into UChars */
1006 switch(base64Counter) {
1007 case -1: /* -1 is immediately after the & */
1016 bits=(uint16_t)((bits<<6)|base64Value);
1020 c=(UChar)((bits<<4)|(base64Value>>2));
1021 if(isLegalIMAP(c)) {
1024 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1029 *offsets++=sourceIndex;
1030 sourceIndex=nextSourceIndex-1;
1032 bytes[0]=b; /* keep this byte in case an error occurs */
1034 bits=(uint16_t)(base64Value&3);
1038 c=(UChar)((bits<<2)|(base64Value>>4));
1039 if(isLegalIMAP(c)) {
1042 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1047 *offsets++=sourceIndex;
1048 sourceIndex=nextSourceIndex-1;
1050 bytes[0]=b; /* keep this byte in case an error occurs */
1052 bits=(uint16_t)(base64Value&15);
1056 c=(UChar)((bits<<6)|base64Value);
1057 if(isLegalIMAP(c)) {
1060 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1065 *offsets++=sourceIndex;
1066 sourceIndex=nextSourceIndex;
1073 /* will never occur */
1076 } else if(base64Value==-2) {
1077 /* minus sign terminates the base64 sequence */
1079 if(base64Counter==-1) {
1080 /* &- i.e. a minus immediately following an ampersand */
1081 *target++=AMPERSAND;
1083 *offsets++=sourceIndex-1;
1086 /* absorb the minus and leave the Unicode Mode */
1087 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1088 /* bits are illegally left over, a UChar is incomplete */
1089 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1090 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1094 sourceIndex=nextSourceIndex;
1097 if(base64Counter==-1) {
1098 /* illegal: & immediately followed by something other than base64 or minus sign */
1099 /* include the ampersand in the reported sequence */
1105 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1106 /* base64Value==-3 for illegal characters */
1109 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1113 /* target is full */
1114 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1122 * the end of the input stream and detection of truncated input
1123 * are handled by the framework, but here we must check if we are in Unicode
1124 * mode and byteIndex==0 because we must end in direct mode
1128 * in Unicode mode and byteIndex==0
1129 * end of input and no truncated input
1131 if( U_SUCCESS(*pErrorCode) &&
1132 !inDirectMode && byteIndex==0 &&
1133 pArgs->flush && source>=sourceLimit
1135 if(base64Counter==-1) {
1136 /* & at the very end of the input */
1137 /* make the ampersand the reported sequence */
1141 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1143 inDirectMode=TRUE; /* avoid looping */
1144 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1147 /* set the converter state back into UConverter */
1148 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1149 cnv->toULength=byteIndex;
1151 /* write back the updated pointers */
1152 pArgs->source=(const char *)source;
1153 pArgs->target=target;
1154 pArgs->offsets=offsets;
1159 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1160 UErrorCode *pErrorCode) {
1162 const UChar *source, *sourceLimit;
1163 uint8_t *target, *targetLimit;
1166 int32_t length, targetCapacity, sourceIndex;
1172 int8_t base64Counter;
1175 /* set up the local pointers */
1176 cnv=pArgs->converter;
1178 /* set up the local pointers */
1179 source=pArgs->source;
1180 sourceLimit=pArgs->sourceLimit;
1181 target=(uint8_t *)pArgs->target;
1182 targetLimit=(uint8_t *)pArgs->targetLimit;
1183 offsets=pArgs->offsets;
1185 /* get the state machine state */
1187 uint32_t status=cnv->fromUnicodeStatus;
1188 inDirectMode=(UBool)((status>>24)&1);
1189 base64Counter=(int8_t)(status>>16);
1190 bits=(uint8_t)status;
1193 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1198 length=(int32_t)(sourceLimit-source);
1199 targetCapacity=(int32_t)(targetLimit-target);
1200 if(length>targetCapacity) {
1201 length=targetCapacity;
1205 /* encode 0x20..0x7e except '&' directly */
1207 /* encode directly */
1208 *target++=(uint8_t)c;
1210 *offsets++=sourceIndex++;
1212 } else if(c==AMPERSAND) {
1213 /* output &- for & */
1214 *target++=AMPERSAND;
1215 if(target<targetLimit) {
1218 *offsets++=sourceIndex;
1219 *offsets++=sourceIndex++;
1221 /* realign length and targetCapacity */
1225 *offsets++=sourceIndex++;
1227 cnv->charErrorBuffer[0]=MINUS;
1228 cnv->charErrorBufferLength=1;
1229 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1233 /* un-read this character and switch to Unicode Mode */
1235 *target++=AMPERSAND;
1237 *offsets++=sourceIndex;
1245 if(source<sourceLimit && target>=targetLimit) {
1246 /* target is full */
1247 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1251 while(source<sourceLimit) {
1252 if(target<targetLimit) {
1254 if(isLegalIMAP(c)) {
1255 /* encode directly */
1258 /* trick: back out this character to make this easier */
1261 /* terminate the base64 sequence */
1262 if(base64Counter!=0) {
1263 /* write remaining bits for the previous character */
1264 *target++=TO_BASE64_IMAP(bits);
1266 *offsets++=sourceIndex-1;
1269 /* need to terminate with a minus */
1270 if(target<targetLimit) {
1273 *offsets++=sourceIndex-1;
1276 cnv->charErrorBuffer[0]=MINUS;
1277 cnv->charErrorBufferLength=1;
1278 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1284 * base64 this character:
1285 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1286 * and the bits of this character, each implicitly in UTF-16BE.
1288 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1289 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1290 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1292 switch(base64Counter) {
1295 *target++=TO_BASE64_IMAP(b);
1296 if(target<targetLimit) {
1297 b=(uint8_t)((c>>4)&0x3f);
1298 *target++=TO_BASE64_IMAP(b);
1300 *offsets++=sourceIndex;
1301 *offsets++=sourceIndex++;
1305 *offsets++=sourceIndex++;
1307 b=(uint8_t)((c>>4)&0x3f);
1308 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1309 cnv->charErrorBufferLength=1;
1310 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1312 bits=(uint8_t)((c&15)<<2);
1316 b=(uint8_t)(bits|(c>>14));
1317 *target++=TO_BASE64_IMAP(b);
1318 if(target<targetLimit) {
1319 b=(uint8_t)((c>>8)&0x3f);
1320 *target++=TO_BASE64_IMAP(b);
1321 if(target<targetLimit) {
1322 b=(uint8_t)((c>>2)&0x3f);
1323 *target++=TO_BASE64_IMAP(b);
1325 *offsets++=sourceIndex;
1326 *offsets++=sourceIndex;
1327 *offsets++=sourceIndex++;
1331 *offsets++=sourceIndex;
1332 *offsets++=sourceIndex++;
1334 b=(uint8_t)((c>>2)&0x3f);
1335 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1336 cnv->charErrorBufferLength=1;
1337 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1341 *offsets++=sourceIndex++;
1343 b=(uint8_t)((c>>8)&0x3f);
1344 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1345 b=(uint8_t)((c>>2)&0x3f);
1346 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1347 cnv->charErrorBufferLength=2;
1348 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1350 bits=(uint8_t)((c&3)<<4);
1354 b=(uint8_t)(bits|(c>>12));
1355 *target++=TO_BASE64_IMAP(b);
1356 if(target<targetLimit) {
1357 b=(uint8_t)((c>>6)&0x3f);
1358 *target++=TO_BASE64_IMAP(b);
1359 if(target<targetLimit) {
1360 b=(uint8_t)(c&0x3f);
1361 *target++=TO_BASE64_IMAP(b);
1363 *offsets++=sourceIndex;
1364 *offsets++=sourceIndex;
1365 *offsets++=sourceIndex++;
1369 *offsets++=sourceIndex;
1370 *offsets++=sourceIndex++;
1372 b=(uint8_t)(c&0x3f);
1373 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1374 cnv->charErrorBufferLength=1;
1375 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1379 *offsets++=sourceIndex++;
1381 b=(uint8_t)((c>>6)&0x3f);
1382 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1383 b=(uint8_t)(c&0x3f);
1384 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1385 cnv->charErrorBufferLength=2;
1386 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1392 /* will never occur */
1397 /* target is full */
1398 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1404 if(pArgs->flush && source>=sourceLimit) {
1405 /* flush remaining bits to the target */
1407 if(base64Counter!=0) {
1408 if(target<targetLimit) {
1409 *target++=TO_BASE64_IMAP(bits);
1411 *offsets++=sourceIndex-1;
1414 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1415 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1418 /* need to terminate with a minus */
1419 if(target<targetLimit) {
1422 *offsets++=sourceIndex-1;
1425 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1426 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1429 /* reset the state for the next conversion */
1430 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1432 /* set the converter state back into UConverter */
1433 cnv->fromUnicodeStatus=
1434 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
1435 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1438 /* write back the updated pointers */
1439 pArgs->source=source;
1440 pArgs->target=(char *)target;
1441 pArgs->offsets=offsets;
1445 static const UConverterImpl _IMAPImpl={
1455 _IMAPToUnicodeWithOffsets,
1456 _IMAPToUnicodeWithOffsets,
1457 _IMAPFromUnicodeWithOffsets,
1458 _IMAPFromUnicodeWithOffsets,
1463 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1465 ucnv_getCompleteUnicodeSet
1468 static const UConverterStaticData _IMAPStaticData={
1469 sizeof(UConverterStaticData),
1470 "IMAP-mailbox-name",
1471 0, /* TODO CCSID for IMAP-mailbox-name */
1472 UCNV_IBM, UCNV_IMAP_MAILBOX,
1474 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1478 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1481 const UConverterSharedData _IMAPData=
1482 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);