1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * Copyright (C) 2000-2012 Jeffrey Stedfast
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public License
7 * as published by the Free Software Foundation; either version 2.1
8 * of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free
17 * Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
33 #ifdef HAVE_SYS_PARAM_H
34 #include <sys/param.h> /* for MAXHOSTNAMELEN */
36 #define MAXHOSTNAMELEN 64
38 #ifdef HAVE_UTSNAME_DOMAINNAME
39 #include <sys/utsname.h> /* for uname() */
41 #include <sys/types.h>
42 #include <unistd.h> /* Unix header for getpid() */
53 #include "gmime-utils.h"
54 #include "gmime-table-private.h"
55 #include "gmime-parse-utils.h"
56 #include "gmime-part.h"
57 #include "gmime-charset.h"
58 #include "gmime-iconv.h"
59 #include "gmime-iconv-utils.h"
61 #ifdef ENABLE_WARNINGS
65 #endif /* ENABLE_WARNINGS */
71 * SECTION: gmime-utils
73 * @short_description: MIME utility functions
76 * Utility functions to parse, encode and decode various MIME tokens
80 extern gboolean _g_mime_enable_rfc2047_workarounds (void);
82 #define GMIME_FOLD_PREENCODED (GMIME_FOLD_LEN / 2)
84 /* date parser macros */
85 #define NUMERIC_CHARS "1234567890"
86 #define WEEKDAY_CHARS "SundayMondayTuesdayWednesdayThursdayFridaySaturday"
87 #define MONTH_CHARS "JanuaryFebruaryMarchAprilMayJuneJulyAugustSeptemberOctoberNovemberDecember"
88 #define TIMEZONE_ALPHA_CHARS "UTCGMTESTEDTCSTCDTMSTPSTPDTZAMNY()"
89 #define TIMEZONE_NUMERIC_CHARS "-+1234567890"
90 #define TIME_CHARS "1234567890:"
92 #define DATE_TOKEN_NON_NUMERIC (1 << 0)
93 #define DATE_TOKEN_NON_WEEKDAY (1 << 1)
94 #define DATE_TOKEN_NON_MONTH (1 << 2)
95 #define DATE_TOKEN_NON_TIME (1 << 3)
96 #define DATE_TOKEN_HAS_COLON (1 << 4)
97 #define DATE_TOKEN_NON_TIMEZONE_ALPHA (1 << 5)
98 #define DATE_TOKEN_NON_TIMEZONE_NUMERIC (1 << 6)
99 #define DATE_TOKEN_HAS_SIGN (1 << 7)
101 static unsigned char tohex[16] = {
102 '0', '1', '2', '3', '4', '5', '6', '7',
103 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
106 static unsigned char gmime_datetok_table[256] = {
107 128,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
108 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
109 111,111,111,111,111,111,111,111, 79, 79,111,175,111,175,111,111,
110 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,119,111,111,111,111,111,
111 111, 75,111, 79, 75, 79,105, 79,111,111,107,111,111, 73, 75,107,
112 79,111,111, 73, 77, 79,111,109,111, 79, 79,111,111,111,111,111,
113 111,105,107,107,109,105,111,107,105,105,111,111,107,107,105,105,
114 107,111,105,105,105,105,107,111,111,105,111,111,111,111,111,111,
115 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
116 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
117 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
118 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
119 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
120 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
121 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
122 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
125 /* Timezone values defined in rfc5322 */
140 /* Note: rfc822 got the signs backwards for the military
141 * timezones so some sending clients may mistakenly use the
170 static char *tm_months[] = {
171 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
172 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
175 static char *tm_days[] = {
176 "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
181 * g_mime_utils_header_format_date:
182 * @date: time_t date representation
183 * @tz_offset: Timezone offset
185 * Allocates a string buffer containing the rfc822 formatted date
186 * string represented by @date and @tz_offset.
188 * Returns: a valid string representation of the date.
191 g_mime_utils_header_format_date (time_t date, int tz_offset)
195 date += ((tz_offset / 100) * (60 * 60)) + (tz_offset % 100) * 60;
197 #if defined (HAVE_GMTIME_R)
198 gmtime_r (&date, &tm);
199 #elif defined (HAVE_GMTIME_S)
200 gmtime_s (&tm, &date);
202 memcpy (&tm, gmtime (&date), sizeof (tm));
205 return g_strdup_printf ("%s, %02d %s %04d %02d:%02d:%02d %+05d",
206 tm_days[tm.tm_wday], tm.tm_mday,
207 tm_months[tm.tm_mon],
209 tm.tm_hour, tm.tm_min, tm.tm_sec,
213 /* This is where it gets ugly... */
215 typedef struct _date_token {
216 struct _date_token *next;
222 #define date_token_free(tok) g_slice_free (date_token, tok)
223 #define date_token_new() g_slice_new (date_token)
/* datetok: breaks @date up into date_token nodes. Each node records
 * where its token starts inside @date and how long it is, so tokens
 * point into the caller's string rather than owning a copy. */
226 datetok (const char *date)
228 date_token tokens, *token, *tail;
229 const char *start, *end;
/* tail initially aliases the local list head `tokens` */
232 tail = (date_token *) &tokens;
237 /* kill leading whitespace */
238 while (*start == ' ' || *start == '\t')
/* seed the character-class mask with the token's first byte */
244 mask = gmime_datetok_table[(unsigned char) *start];
246 /* find the end of this token */
/* a token stops at NUL or at one of '-', '/', ',', tab, CR, LF or
 * space; every byte consumed ORs its class bits into the mask */
248 while (*end && !strchr ("-/,\t\r\n ", *end))
249 mask |= gmime_datetok_table[(unsigned char) *end++];
252 token = date_token_new ();
254 token->start = start;
255 token->len = end - start;
/**
 * decode_int:
 * @in: start of the token to decode (need not be nul-terminated)
 * @inlen: number of bytes of @in to consider
 *
 * Decodes an optionally signed ('+' or '-') run of ASCII decimal
 * digits. An empty token, or a token holding only a sign, decodes to 0.
 *
 * Returns: the decoded value, or -1 if a non-digit character is found
 * or the value does not fit in an int. Note that -1 is also what the
 * literal "-1" decodes to; callers in this file treat -1 as failure.
 **/
static int
decode_int (const char *in, size_t inlen)
{
	/* all value bits of int set; equals INT_MAX on the usual
	 * two's-complement ABIs and avoids needing another header */
	const unsigned int limit = ((unsigned int) -1) >> 1;
	register const char *inptr = in;
	const char *inend = in + inlen;
	unsigned int val = 0, digit;
	int sign = 1;
	
	/* only look for a sign when there is at least one byte to read */
	if (inptr < inend) {
		if (*inptr == '-') {
			sign = -1;
			inptr++;
		} else if (*inptr == '+') {
			inptr++;
		}
	}
	
	for ( ; inptr < inend; inptr++) {
		if (!(*inptr >= '0' && *inptr <= '9'))
			return -1;
		
		digit = (unsigned int) (*inptr - '0');
		
		/* stop before (val * 10) + digit would exceed the int range:
		 * signed overflow is undefined and the digits come straight
		 * from a message header */
		if (val > (limit - digit) / 10)
			return -1;
		
		val = (val * 10) + digit;
	}
	
	return sign * (int) val;
}
/* get_days_in_month: takes a @month and a @year; the year matters
 * because leap years are special-cased via g_date_is_leap_year(). */
301 get_days_in_month (int month, int year)
318 if (g_date_is_leap_year (year))
/* get_wday: looks a weekday token up in tm_days[]. -1 is returned for
 * a NULL @in or when no entry matches. */
329 get_wday (const char *in, size_t inlen)
333 g_return_val_if_fail (in != NULL, -1);
/* only the first 3 characters are compared, ignoring ASCII case */
338 for (wday = 0; wday < 7; wday++) {
339 if (!g_ascii_strncasecmp (in, tm_days[wday], 3))
343 return -1; /* unknown week day */
/* get_mday: decodes a day-of-month token with decode_int() and then
 * range-checks the result. */
347 get_mday (const char *in, size_t inlen)
351 g_return_val_if_fail (in != NULL, -1);
353 mday = decode_int (in, inlen);
/* NOTE(review): this check lets 0 through; only negative values and
 * values above 31 fail it */
355 if (mday < 0 || mday > 31)
/* get_month: looks a month token up in tm_months[]. -1 is returned for
 * a NULL @in or when no entry matches. */
362 get_month (const char *in, size_t inlen)
366 g_return_val_if_fail (in != NULL, -1);
/* only the first 3 characters are compared, ignoring ASCII case */
371 for (i = 0; i < 12; i++) {
372 if (!g_ascii_strncasecmp (in, tm_months[i], 3))
376 return -1; /* unknown month */
/* get_year: decodes a year token with decode_int(); a decode result of
 * -1 is treated as failure. */
380 get_year (const char *in, size_t inlen)
384 g_return_val_if_fail (in != NULL, -1);
386 if ((year = decode_int (in, inlen)) == -1)
/* short-year pivot: values below 70 are taken as 20xx, the rest as
 * 19xx (presumably applied to 2-digit years only; confirm the guard) */
390 year += (year < 70) ? 2000 : 1900;
/* get_time: parses a time token into *@hour, *@min and *@sec. All
 * three outputs are zeroed first; @val points at whichever field is
 * currently being accumulated and @colons counts the separators seen. */
399 get_time (const char *in, size_t inlen, int *hour, int *min, int *sec)
401 register const char *inptr;
402 int *val, colons = 0;
405 *hour = *min = *sec = 0;
409 for (inptr = in; inptr < inend; inptr++) {
/* anything that is not a decimal digit fails this test */
422 } else if (!(*inptr >= '0' && *inptr <= '9'))
/* NOTE(review): digits are accumulated with no upper bound, so a very
 * long digit run would overflow the int behind @val */
425 *val = (*val * 10) + (*inptr - '0');
/* get_tzone: examines at most two tokens starting at *@token, advancing
 * *@token as it goes. A token that begins with '+' or '-' is decoded
 * numerically; otherwise the token is compared against the names in
 * tz_offsets[]. */
432 get_tzone (date_token **token)
434 const char *inptr, *inend;
438 for (i = 0; *token && i < 2; *token = (*token)->next, i++) {
439 inptr = (*token)->start;
440 inlen = (*token)->len;
441 inend = inptr + inlen;
/* numeric zone: the sign is part of the token handed to decode_int() */
443 if (*inptr == '+' || *inptr == '-') {
444 return decode_int (inptr, inlen);
/* a trailing ')' is detected here, presumably so that a parenthesised
 * zone name such as "(EST)" can be matched; TODO confirm */
448 if (*(inend - 1) == ')')
/* NOTE(review): the bound 15 is hard-coded; confirm it matches the
 * number of entries in tz_offsets[] */
454 for (t = 0; t < 15; t++) {
455 size_t len = strlen (tz_offsets[t].name);
/* case-sensitive comparison over the length of the table name */
460 if (!strncmp (inptr, tz_offsets[t].name, len))
461 return tz_offsets[t].offset;
/* mktime_utc: how the local UTC offset is obtained is chosen at compile
 * time from the feature macros below; when none of them is defined the
 * build stops at the #error. */
470 mktime_utc (struct tm *tm)
478 #if defined (G_OS_WIN32) && !defined (__MINGW32__)
480 if (tm->tm_isdst > 0) {
486 #elif defined (HAVE_TM_GMTOFF)
488 #elif defined (HAVE_TIMEZONE)
489 if (tm->tm_isdst > 0) {
490 #if defined (HAVE_ALTZONE)
492 #else /* !defined (HAVE_ALTZONE) */
/* without altzone, daylight time is taken to be one hour (3600 s)
 * off from `timezone` */
493 tz = (timezone - 3600);
498 #elif defined (HAVE__TIMEZONE)
501 #error Neither HAVE_TIMEZONE nor HAVE_TM_GMTOFF defined. Rerun autoheader, autoconf, etc.
/* parse_rfc822_date: strict parser that expects the tokens in the order
 * [weekday] day month year time [timezone]. The caller in
 * g_mime_utils_header_decode_date() treats a zero result as failure and
 * falls back to parse_broken_date(). */
508 parse_rfc822_date (date_token *tokens, int *tzone)
510 int hour, min, sec, offset, n;
515 g_return_val_if_fail (tokens != NULL, (time_t) 0);
519 memset ((void *) &tm, 0, sizeof (struct tm));
521 if ((n = get_wday (token->start, token->len)) != -1) {
522 /* not all dates may have this... */
/* from here on every field is mandatory: running out of tokens or a
 * helper returning -1 fails the corresponding check */
528 if (!token || (n = get_mday (token->start, token->len)) == -1)
535 if (!token || (n = get_month (token->start, token->len)) == -1)
542 if (!token || (n = get_year (token->start, token->len)) == -1)
/* struct tm counts years from 1900 */
545 tm.tm_year = n - 1900;
548 /* get the hour/min/sec */
549 if (!token || !get_time (token->start, token->len, &hour, &min, &sec))
557 /* get the timezone */
558 if (!token || (n = get_tzone (&token)) == -1) {
559 /* I guess we assume tz is GMT? */
565 t = mktime_utc (&tm);
567 /* t is now GMT of the time we want, but not offset by the timezone ... */
569 /* this should convert the time to the GMT equiv time */
/* offset is encoded as +/-HHMM: offset / 100 is hours, offset % 100 is
 * minutes */
570 t -= ((offset / 100) * 60 * 60) + (offset % 100) * 60;
/* Token classification helpers. A token belongs to a class when none of
 * its characters set the matching DATE_TOKEN_NON_* bit in its mask;
 * is_time additionally requires a ':' and is_tzone_numeric a sign. */
579 #define date_token_mask(t) (((date_token *) t)->mask)
580 #define is_numeric(t) ((date_token_mask (t) & DATE_TOKEN_NON_NUMERIC) == 0)
581 #define is_weekday(t) ((date_token_mask (t) & DATE_TOKEN_NON_WEEKDAY) == 0)
582 #define is_month(t) ((date_token_mask (t) & DATE_TOKEN_NON_MONTH) == 0)
583 #define is_time(t) (((date_token_mask (t) & DATE_TOKEN_NON_TIME) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_COLON))
584 #define is_tzone_alpha(t) ((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_ALPHA) == 0)
585 #define is_tzone_numeric(t) (((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_NUMERIC) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_SIGN))
586 #define is_tzone(t) (is_tzone_alpha (t) || is_tzone_numeric (t))
/* parse_broken_date: lenient fallback parser. Tokens are classified by
 * their mask and used to fill in whichever of weekday, month, time,
 * timezone, year and day-of-month they look like. Each field is only
 * taken once, guarded either by a got_* flag or by the tm field still
 * being zero. */
589 parse_broken_date (date_token *tokens, int *tzone)
591 gboolean got_wday, got_month, got_tzone;
592 int hour, min, sec, offset, n;
597 memset ((void *) &tm, 0, sizeof (struct tm));
598 got_wday = got_month = got_tzone = FALSE;
603 if (is_weekday (token) && !got_wday) {
604 if ((n = get_wday (token->start, token->len)) != -1) {
605 d(printf ("weekday; "));
612 if (is_month (token) && !got_month) {
613 if ((n = get_month (token->start, token->len)) != -1) {
614 d(printf ("month; "));
/* NOTE(review): "no time seen yet" is inferred from all three fields
 * being zero, so a parsed 00:00:00 does not block a later time token */
621 if (is_time (token) && !tm.tm_hour && !tm.tm_min && !tm.tm_sec) {
622 if (get_time (token->start, token->len, &hour, &min, &sec)) {
623 d(printf ("time; "));
631 if (is_tzone (token) && !got_tzone) {
/* work on a copy: get_tzone() advances the pointer it is given */
632 date_token *t = token;
634 if ((n = get_tzone (&t)) != -1) {
635 d(printf ("tzone; "));
642 if (is_numeric (token)) {
/* a 4-character numeric token is tried as a full year first */
643 if (token->len == 4 && !tm.tm_year) {
644 if ((n = get_year (token->start, token->len)) != -1) {
645 d(printf ("year; "));
646 tm.tm_year = n - 1900;
650 /* Note: assumes MM-DD-YY ordering if '0 < MM <= 12' holds true */
651 if (!got_month && token->next && is_numeric (token->next)) {
652 if ((n = decode_int (token->start, token->len)) > 12) {
660 } else if (!tm.tm_mday && (n = get_mday (token->start, token->len)) != -1) {
662 d(printf ("mday; "));
665 } else if (!tm.tm_year) {
666 if ((n = get_year (token->start, token->len)) != -1) {
667 d(printf ("2-digit year; "));
668 tm.tm_year = n - 1900;
684 t = mktime_utc (&tm);
686 /* t is now GMT of the time we want, but not offset by the timezone ... */
688 /* this should convert the time to the GMT equiv time */
/* offset is encoded as +/-HHMM */
689 t -= ((offset / 100) * 60 * 60) + (offset % 100) * 60;
/* gmime_datetok_table_init: rebuilds gmime_datetok_table from the
 * *_CHARS macros and prints it as a C array declaration. For each byte
 * value a NON_* bit is set when the byte is absent from the matching
 * character set; the HAS_* bits are set when the byte is ':' or a sign. */
699 gmime_datetok_table_init (void)
703 memset (gmime_datetok_table, 0, sizeof (gmime_datetok_table));
/* note: strchr() also matches the terminating NUL of its string, so
 * byte value 0 gets no NON_* bits and does get DATE_TOKEN_HAS_SIGN */
705 for (i = 0; i < 256; i++) {
706 if (!strchr (NUMERIC_CHARS, i))
707 gmime_datetok_table[i] |= DATE_TOKEN_NON_NUMERIC;
709 if (!strchr (WEEKDAY_CHARS, i))
710 gmime_datetok_table[i] |= DATE_TOKEN_NON_WEEKDAY;
712 if (!strchr (MONTH_CHARS, i))
713 gmime_datetok_table[i] |= DATE_TOKEN_NON_MONTH;
715 if (!strchr (TIME_CHARS, i))
716 gmime_datetok_table[i] |= DATE_TOKEN_NON_TIME;
718 if (!strchr (TIMEZONE_ALPHA_CHARS, i))
719 gmime_datetok_table[i] |= DATE_TOKEN_NON_TIMEZONE_ALPHA;
721 if (!strchr (TIMEZONE_NUMERIC_CHARS, i))
722 gmime_datetok_table[i] |= DATE_TOKEN_NON_TIMEZONE_NUMERIC;
724 if (((char) i) == ':')
725 gmime_datetok_table[i] |= DATE_TOKEN_HAS_COLON;
727 if (strchr ("+-", i))
728 gmime_datetok_table[i] |= DATE_TOKEN_HAS_SIGN;
/* dump the table in the same form as the static initialiser */
731 printf ("static unsigned char gmime_datetok_table[256] = {");
732 for (i = 0; i < 256; i++) {
735 printf ("%3d,", gmime_datetok_table[i]);
743 * g_mime_utils_header_decode_date:
744 * @str: input date string
745 * @tz_offset: timezone offset
747 * Decodes the rfc822 date string and saves the GMT offset into
748 * @tz_offset if non-NULL.
750 * Returns: the time_t representation of the date string specified by
751 * @str or (time_t) %0 on error. If @tz_offset is non-NULL, the value
752 * of the timezone offset will be stored.
755 g_mime_utils_header_decode_date (const char *str, int *tz_offset)
757 date_token *token, *tokens;
/* tokenise first; an empty token list is handled separately */
760 if (!(tokens = datetok (str))) {
/* try the strict parser first; a zero result means it gave up, so the
 * lenient parser gets a go at the same tokens */
767 if (!(date = parse_rfc822_date (tokens, tz_offset)))
768 date = parse_broken_date (tokens, tz_offset);
/* release the token list node by node */
773 tokens = tokens->next;
774 date_token_free (token);
782 * g_mime_utils_generate_message_id:
783 * @fqdn: Fully qualified domain name
785 * Generates a unique Message-Id.
787 * Returns: a unique string in an addr-spec format suitable for use as
791 g_mime_utils_generate_message_id (const char *fqdn)
/* MUTEX_LOCK/MUTEX_UNLOCK wrap a function-local static mutex when GLib
 * threads are enabled (presumably to guard the static counter below) */
793 #ifdef G_THREADS_ENABLED
794 static GStaticMutex mutex = G_STATIC_MUTEX_INIT;
795 #define MUTEX_LOCK() g_static_mutex_lock (&mutex)
796 #define MUTEX_UNLOCK() g_static_mutex_unlock (&mutex)
799 #define MUTEX_UNLOCK()
/* static counter: the third component of every generated id */
801 static unsigned long int count = 0;
802 const char *hostname = NULL;
807 #ifdef HAVE_UTSNAME_DOMAINNAME
812 hostname = unam.nodename;
/* node name plus domain name gives the fully qualified name */
814 if (unam.domainname[0])
815 name = g_strdup_printf ("%s.%s", hostname, unam.domainname);
816 #else /* ! HAVE_UTSNAME_DOMAINNAME */
817 char host[MAXHOSTNAMELEN + 1];
819 #ifdef HAVE_GETHOSTNAME
/* gethostname() is only given MAXHOSTNAMELEN bytes, so the extra last
 * byte stays a terminator even if the name was truncated */
820 host[MAXHOSTNAMELEN] = '\0';
821 if (gethostname (host, MAXHOSTNAMELEN) == 0) {
822 #ifdef HAVE_GETDOMAINNAME
823 size_t domainlen = MAXHOSTNAMELEN;
827 domain = g_malloc (domainlen);
/* grow the buffer for as long as getdomainname() rejects its size */
829 while ((rv = getdomainname (domain, domainlen)) == -1 && errno == EINVAL) {
830 domainlen += MAXHOSTNAMELEN;
831 domain = g_realloc (domain, domainlen);
834 if (rv == 0 && domain[0]) {
836 name = g_strdup_printf ("%s.%s", host, domain);
842 #endif /* HAVE_GETDOMAINNAME */
846 #endif /* HAVE_GETHOSTNAME */
848 #endif /* HAVE_UTSNAME_DOMAINNAME */
850 #ifdef HAVE_GETADDRINFO
851 if (!name && hostname[0]) {
852 /* we weren't able to get a domain name */
853 struct addrinfo hints, *res;
855 memset (&hints, 0, sizeof (hints));
/* ask the resolver for the canonical name of the bare host name */
856 hints.ai_flags = AI_CANONNAME;
858 if (getaddrinfo (hostname, NULL, &hints, &res) == 0) {
859 name = g_strdup (res->ai_canonname);
863 #endif /* HAVE_GETADDRINFO */
/* preference order: composed/canonical name, bare host name, then a
 * fixed placeholder */
865 fqdn = name != NULL ? name : (hostname[0] ? hostname : "localhost.localdomain");
/* id layout: <seconds since epoch>.<pid>.<counter>@<fqdn> */
869 msgid = g_strdup_printf ("%lu.%lu.%lu@%s", (unsigned long int) time (NULL),
870 (unsigned long int) getpid (), count++, fqdn);
/* decode_addrspec: parses `local-part "@" domain` starting at *@in and
 * assembles the result in a GString. The local-part is one word
 * optionally followed by further words, each introduced by '.'. */
879 decode_addrspec (const char **in)
881 const char *word, *inptr;
888 if (!(word = decode_word (&inptr))) {
889 w(g_warning ("No local-part in addr-spec: %s", *in));
893 addrspec = g_string_new ("");
/* decode_word() hands back the start of the word and leaves inptr past
 * it, hence the (inptr - word) length */
894 g_string_append_len (addrspec, word, (size_t) (inptr - word));
896 /* get the rest of the local-part */
897 decode_lwsp (&inptr);
898 while (*inptr == '.') {
899 g_string_append_c (addrspec, *inptr++);
900 if ((word = decode_word (&inptr))) {
901 g_string_append_len (addrspec, word, (size_t) (inptr - word));
902 decode_lwsp (&inptr);
904 w(g_warning ("Invalid local-part in addr-spec: %s", *in));
909 /* we should be at the '@' now... */
910 if (*inptr++ != '@') {
911 w(g_warning ("Invalid addr-spec; missing '@': %s", *in));
915 g_string_append_c (addrspec, '@');
/* the domain is appended directly onto the same GString */
916 if (!decode_domain (&inptr, addrspec)) {
917 w(g_warning ("No domain in addr-spec: %s", *in));
/* FALSE keeps the character data alive for the caller, TRUE discards
 * it (presumably the success and failure exits respectively) */
922 g_string_free (addrspec, FALSE);
930 g_string_free (addrspec, TRUE);
/* decode_msgid: parses `"<" addr-spec ">"` starting at *@in, skipping
 * linear whitespace around each piece. When no addr-spec can be decoded
 * it falls back to copying raw text up to the next '>'. */
936 decode_msgid (const char **in)
938 const char *inptr = *in;
941 decode_lwsp (&inptr);
943 w(g_warning ("Invalid msg-id; missing '<': %s", *in));
948 decode_lwsp (&inptr);
949 if ((msgid = decode_addrspec (&inptr))) {
950 decode_lwsp (&inptr);
952 w(g_warning ("Invalid msg-id; missing '>': %s", *in));
959 w(g_warning ("Invalid msg-id; missing addr-spec: %s", *in));
/* fallback: scan ahead to '>' (or end of string) and keep the raw
 * bytes between *in and the scan position */
961 while (*inptr && *inptr != '>')
964 msgid = g_strndup (*in, (size_t) (inptr - *in));
973 * g_mime_utils_decode_message_id:
974 * @message_id: string containing a message-id
976 * Decodes a msg-id as defined by rfc822.
978 * Returns: the addr-spec portion of the msg-id.
981 g_mime_utils_decode_message_id (const char *message_id)
983 g_return_val_if_fail (message_id != NULL, NULL);
/* decode_msgid() receives the address of this function's own parameter,
 * so the caller's pointer is left untouched */
985 return decode_msgid (&message_id);
990 * g_mime_references_decode:
991 * @text: string containing a list of msg-ids
993 * Decodes a list of msg-ids as in the References and/or In-Reply-To
994 * headers defined in rfc822.
996 * Returns: a list of referenced msg-ids.
999 g_mime_references_decode (const char *text)
1001 GMimeReferences refs, *tail, *ref;
1002 const char *word, *inptr = text;
1005 g_return_val_if_fail (text != NULL, NULL);
/* tail initially aliases the local list head `refs` */
1007 tail = (GMimeReferences *) &refs;
1011 decode_lwsp (&inptr);
1012 if (*inptr == '<') {
1013 /* looks like a msg-id */
/* each successfully decoded msg-id gets its own list node */
1014 if ((msgid = decode_msgid (&inptr))) {
1015 ref = g_new (GMimeReferences, 1);
1021 w(g_warning ("Invalid References header: %s", inptr));
1024 } else if (*inptr) {
1025 /* looks like part of a phrase */
/* text outside angle brackets is consumed word by word */
1026 if (!(word = decode_word (&inptr))) {
1027 w(g_warning ("Invalid References header: %s", inptr));
1038 * g_mime_references_append:
1039 * @refs: the address of a #GMimeReferences list
1040 * @msgid: a message-id string
1042 * Appends a reference to msgid to the list of references.
1045 g_mime_references_append (GMimeReferences **refs, const char *msgid)
1047 GMimeReferences *ref;
1049 g_return_if_fail (refs != NULL);
1050 g_return_if_fail (msgid != NULL);
/* the list-head pointer itself is treated as a node so an empty list
 * needs no special case. NOTE(review): this relies on `next` being the
 * first member of GMimeReferences; confirm against the struct */
1052 ref = (GMimeReferences *) refs;
/* the new tail node gets its own copy of @msgid */
1056 ref->next = g_new (GMimeReferences, 1);
1057 ref->next->msgid = g_strdup (msgid);
1058 ref->next->next = NULL;
1063 * g_mime_references_free:
1064 * @refs: a #GMimeReferences list
1066 * Frees the #GMimeReferences list.
1069 g_mime_references_free (GMimeReferences *refs)
/* two cursors: presumably `next` is saved before `ref` is released */
1071 GMimeReferences *ref, *next;
/* the msgid string of each node is freed here */
1076 g_free (ref->msgid);
1084 * g_mime_references_clear:
1085 * @refs: address of a #GMimeReferences list
1087 * Clears the #GMimeReferences list and resets it to %NULL.
1090 g_mime_references_clear (GMimeReferences **refs)
1092 g_return_if_fail (refs != NULL);
/* frees the list that *@refs currently points at */
1094 g_mime_references_free (*refs);
1100 * g_mime_references_get_next:
1101 * @ref: a #GMimeReferences list
1103 * Advances to the next reference node in the #GMimeReferences list.
1105 * Returns: the next reference node in the #GMimeReferences list.
1107 const GMimeReferences *
1108 g_mime_references_get_next (const GMimeReferences *ref)
1110 return ref ? ref->next : NULL;
1115 * g_mime_references_get_message_id:
1116 * @ref: a #GMimeReferences list
1118 * Gets the Message-Id reference from the #GMimeReferences node.
1120 * Returns: the Message-Id reference from the #GMimeReferences node.
1123 g_mime_references_get_message_id (const GMimeReferences *ref)
/* a NULL node is tolerated and yields NULL; otherwise the node's own
 * msgid member is returned, not a copy */
1125 return ref ? ref->msgid : NULL;
/* is_rfc2047_token: checks whether the @len bytes at @inptr have the
 * outline of an rfc2047 encoded-word: at least 8 bytes, opening with
 * "=?" and closing with "?=", with a charset followed by '?' and an
 * encoding letter. */
1130 is_rfc2047_token (const char *inptr, size_t len)
1132 if (len < 8 || strncmp (inptr, "=?", 2) != 0 || strncmp (inptr + len - 2, "?=", 2) != 0)
1138 /* skip past the charset */
1139 while (*inptr != '?' && len > 0) {
/* the charset must be terminated by '?' with enough bytes left over */
1144 if (*inptr != '?' || len < 4)
/* the encoding letter must be Q or B, in either case */
1147 if (inptr[1] != 'q' && inptr[1] != 'Q' && inptr[1] != 'b' && inptr[1] != 'B')
/* header_fold: builds a folded copy of the header line @in by inserting
 * "\n\t" fold points, aiming to keep lines within GMIME_FOLD_LEN. Input
 * that is already short enough is simply duplicated. @structured
 * selects whether over-long words may be cut into pieces (only done for
 * unstructured headers). */
1160 header_fold (const char *in, gboolean structured)
1162 gboolean last_was_lwsp = FALSE;
1163 register const char *inptr;
1164 size_t len, outlen, i;
/* short input needs no folding */
1171 if (len <= GMIME_FOLD_LEN + 1)
1172 return g_strdup (in);
1174 out = g_string_new ("");
/* the field name runs up to the first ':' or whitespace and is copied
 * through unchanged */
1175 fieldlen = strcspn (inptr, ": \t\n");
1176 g_string_append_len (out, inptr, fieldlen);
1180 while (*inptr && *inptr != '\n') {
/* len is the length of the next run of non-whitespace */
1181 len = strcspn (inptr, " \t\n");
1183 if (len > 1 && outlen + len > GMIME_FOLD_LEN) {
1184 if (outlen > 1 && out->len >= fieldlen + 2) {
/* reuse the whitespace just emitted as the fold: it becomes the tab
 * and a newline is slipped in before it */
1185 if (last_was_lwsp) {
1187 out->str[out->len - 1] = '\t';
1189 g_string_insert_c (out, out->len - 1, '\n');
1191 g_string_append (out, "\n\t");
/* rfc2047 encoded-words are never cut, nor is anything in a structured
 * header */
1196 if (!structured && !is_rfc2047_token (inptr, len)) {
1197 /* check for very long words, just cut them up */
/* NOTE(review): GMIME_FOLD_LEN - outlen is unsigned arithmetic, so this
 * relies on outlen not exceeding GMIME_FOLD_LEN here; confirm */
1198 while (outlen + len > GMIME_FOLD_LEN) {
1199 for (i = 0; i < GMIME_FOLD_LEN - outlen; i++)
1200 g_string_append_c (out, inptr[i]);
1201 inptr += GMIME_FOLD_LEN - outlen;
1202 len -= GMIME_FOLD_LEN - outlen;
1203 g_string_append (out, "\n\t");
1207 g_string_append_len (out, inptr, len);
1211 last_was_lwsp = FALSE;
1212 } else if (len > 0) {
1213 g_string_append_len (out, inptr, len);
1216 last_was_lwsp = FALSE;
1218 last_was_lwsp = TRUE;
1219 if (*inptr == '\t') {
1220 /* tabs are a good place to fold, odds
1221 are that this is where the previous
1223 g_string_append (out, "\n\t");
1225 while (is_blank (*inptr))
1228 g_string_append_c (out, *inptr++);
1234 if (*inptr == '\n' && out->str[out->len - 1] != '\n')
1235 g_string_append_c (out, '\n');
1238 g_string_free (out, FALSE);
1245 * g_mime_utils_structured_header_fold:
1246 * @str: input string
1248 * Folds a structured header according to the rules in rfc822.
1250 * Returns: an allocated string containing the folded header.
1253 g_mime_utils_structured_header_fold (const char *str)
1255 return header_fold (str, TRUE);
1260 * g_mime_utils_unstructured_header_fold:
1261 * @str: input string
1263 * Folds an unstructured header according to the rules in rfc822.
1265 * Returns: an allocated string containing the folded header.
1268 g_mime_utils_unstructured_header_fold (const char *str)
1270 return header_fold (str, FALSE);
1275 * g_mime_utils_header_fold:
1276 * @str: input string
1278 * Folds a structured header according to the rules in rfc822.
1280 * Returns: an allocated string containing the folded header.
1283 g_mime_utils_header_fold (const char *str)
1285 return header_fold (str, TRUE);
1290 * g_mime_utils_header_printf:
1291 * @format: string format
1292 * @Varargs: arguments
1294 * Allocates a buffer containing a formatted header specified by the
1297 * Returns: an allocated string containing the folded header specified
1298 * by @format and the following arguments.
1301 g_mime_utils_header_printf (const char *format, ...)
1306 va_start (ap, format);
1307 buf = g_strdup_vprintf (format, ap);
1310 ret = header_fold (buf, TRUE);
1317 need_quotes (const char *string)
1319 gboolean quoted = FALSE;
1327 else if (*inptr == '"')
1329 else if (!quoted && (is_tspecial (*inptr) || *inptr == '.'))
1340 * g_mime_utils_quote_string:
1341 * @str: input string
1343 * Quotes @str as needed according to the rules in rfc2045.
1345 * Returns: an allocated string containing the escaped and quoted (if
1346 * needed to be) input string. The decision to quote the string is
1347 * based on whether or not the input string contains any 'tspecials'
1348 * as defined by rfc2045.
1351 g_mime_utils_quote_string (const char *str)
1358 out = g_string_new ("");
1360 if ((quote = need_quotes (str)))
1361 g_string_append_c (out, '"');
1363 for (c = str; *c; c++) {
1364 if ((*c == '"' && quote) || *c == '\\')
1365 g_string_append_c (out, '\\');
1367 g_string_append_c (out, *c);
1371 g_string_append_c (out, '"');
1374 g_string_free (out, FALSE);
1381 * g_mime_utils_unquote_string:
1382 * @str: input string
1384 * Unquotes and unescapes a string.
1387 g_mime_utils_unquote_string (char *str)
1389 /* if the string is quoted, unquote it */
1390 register char *inptr = str;
1391 int escaped = FALSE;
1398 if (*inptr == '\\') {
1404 } else if (*inptr == '"') {
1423 * g_mime_utils_text_is_8bit:
1424 * @text: text to check for 8bit chars
1427 * Determines if @text contains 8bit characters within the first @len
1430 * Returns: %TRUE if the text contains 8bit characters or %FALSE
1434 g_mime_utils_text_is_8bit (const unsigned char *text, size_t len)
1436 register const unsigned char *inptr;
1437 const unsigned char *inend;
1439 g_return_val_if_fail (text != NULL, FALSE);
1442 for (inptr = text; *inptr && inptr < inend; inptr++)
1443 if (*inptr > (unsigned char) 127)
1451 * g_mime_utils_best_encoding:
1452 * @text: text to encode
1455 * Determines the best content encoding for the first @len bytes of
1458 * Returns: a #GMimeContentEncoding that is determined to be the best
1459 * encoding type for the specified block of text. ("best" in this
1460 * particular case means smallest output size)
1462 GMimeContentEncoding
1463 g_mime_utils_best_encoding (const unsigned char *text, size_t len)
1465 const unsigned char *ch, *inend;
1469 for (ch = text; ch < inend; ch++)
1470 if (*ch > (unsigned char) 127)
1473 if ((float) count <= len * 0.17)
1474 return GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE;
1476 return GMIME_CONTENT_ENCODING_BASE64;
1482 * @cd: iconv converter
1483 * @inbuf: input text buffer to convert
1484 * @inleft: length of the input buffer
1485 * @outp: pointer to output buffer
1486 * @outlenp: pointer to output buffer length
1487 * @ninval: the number of invalid bytes in @inbuf
1489 * Converts the input buffer from one charset to another using the
1490 * @cd. On completion, @outp will point to the output buffer
1491 * containing the converted text (nul-terminated), @outlenp will be
1492 * the size of the @outp buffer (note: not the strlen() of @outp) and
1493 * @ninval will contain the number of bytes which could not be
1496 * Bytes which cannot be converted from @inbuf will appear as '?'
1497 * characters in the output buffer.
1499 * If *@outp is non-NULL, then it is assumed that it points to a
1500 * pre-allocated buffer of length *@outlenp. This is done so that the
1501 * same output buffer can be reused multiple times.
1503 * Returns: the string length of the output buffer.
1506 charset_convert (iconv_t cd, const char *inbuf, size_t inleft, char **outp, size_t *outlenp, size_t *ninval)
1508 size_t outlen, outleft, rc, n = 0;
/* no caller buffer: allocate twice the input size plus slack; the
 * extra byte in the allocation leaves room for a terminator */
1511 if (*outp == NULL) {
1512 outleft = outlen = (inleft * 2) + 16;
1513 outbuf = out = g_malloc (outlen + 1);
/* otherwise reuse the buffer handed in by the caller */
1515 outleft = outlen = *outlenp;
1516 outbuf = out = *outp;
/* iconv() advances inbuf/outbuf and shrinks inleft/outleft as it goes */
1520 rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft);
1521 if (rc == (size_t) -1) {
1522 if (errno == EINVAL) {
1523 /* incomplete sequence at the end of the input buffer */
1529 /* seems that GnuWin32's libiconv 1.9 does not set errno in
1530 * the E2BIG case, so we have to fake it */
1531 if (outleft <= inleft)
1535 if (errno == E2BIG || outleft == 0) {
1536 /* need to grow the output buffer */
/* rc temporarily holds the number of bytes already written, which is
 * what survives the realloc possibly moving the buffer */
1537 outlen += (inleft * 2) + 16;
1538 rc = (size_t) (outbuf - out);
1539 out = g_realloc (out, outlen + 1);
1540 outleft = outlen - rc;
1544 /* Note: GnuWin32's libiconv 1.9 can also set errno to ERANGE
1545 * which seems to mean that it encountered a character that
1546 * does not fit the specified 'from' charset. We'll handle
1547 * that the same way we handle EILSEQ. */
1548 if (errno == EILSEQ || errno == ERANGE) {
1549 /* invalid or incomplete multibyte
1550 * sequence in the input buffer */
1558 } while (inleft > 0);
/* a call with no input asks iconv() to emit any pending reset
 * sequence; the buffer is grown and the call retried while it fails */
1560 while (iconv (cd, NULL, NULL, &outbuf, &outleft) == (size_t) -1) {
1565 rc = (size_t) (outbuf - out);
1566 out = g_realloc (out, outlen + 1);
1567 outleft = outlen - rc;
/* number of bytes written into the output buffer */
1577 return (outbuf - out);
1581 #define USER_CHARSETS_INCLUDE_UTF8 (1 << 0)
1582 #define USER_CHARSETS_INCLUDE_LOCALE (1 << 1)
1583 #define USER_CHARSETS_INCLUDE_LATIN1 (1 << 2)
1587 * g_mime_utils_decode_8bit:
1588 * @text: input text in unknown 8bit/multibyte character set
1589 * @len: input text length
1591 * Attempts to convert text in an unknown 8bit/multibyte charset into
1592 * UTF-8 by finding the charset which will convert the most bytes into
1593 * valid UTF-8 characters as possible. If no exact match can be found,
1594 * it will choose the best match and convert invalid byte sequences
1595 * into question-marks (?) in the returned string buffer.
1597 * Returns: a UTF-8 string representation of @text.
1600 g_mime_utils_decode_8bit (const char *text, size_t len)
1602 const char **charsets, **user_charsets, *locale, *best;
1603 size_t outleft, outlen, min, ninval;
/* bitmask of USER_CHARSETS_INCLUDE_* flags for charsets that are
 * already covered and must not be appended a second time */
1604 unsigned int included = 0;
1609 g_return_val_if_fail (text != NULL, NULL);
1611 locale = g_mime_locale_charset ();
1612 if (!g_ascii_strcasecmp (locale, "iso-8859-1") ||
1613 !g_ascii_strcasecmp (locale, "UTF-8")) {
1614 /* If the user's locale charset is either of these, we
1615 * don't need to include the locale charset in our list
1616 * of fallback charsets. */
1617 included |= USER_CHARSETS_INCLUDE_LOCALE;
/* first pass over the user charsets (the loop index i is reused below
 * to size the combined list) */
1620 if ((user_charsets = g_mime_user_charsets ())) {
1621 while (user_charsets[i])
/* room for every user charset plus UTF-8, the locale charset,
 * iso-8859-1 and one more slot (presumably the NULL terminator the
 * conversion loop below stops on) */
1625 charsets = g_alloca (sizeof (char *) * (i + 4));
1628 if (user_charsets) {
1629 while (user_charsets[i]) {
1630 /* keep a record of whether or not the user-supplied
1631 * charsets include UTF-8, Latin1, or the user's locale
1632 * charset so that we avoid doubling our efforts for
1633 * these 3 charsets. We could have used a hash table
1634 * to keep track of unique charsets, but we can
1635 * (hopefully) assume that user_charsets is a unique
1636 * list of charsets with no duplicates. */
1637 if (!g_ascii_strcasecmp (user_charsets[i], "iso-8859-1"))
1638 included |= USER_CHARSETS_INCLUDE_LATIN1;
1640 if (!g_ascii_strcasecmp (user_charsets[i], "UTF-8"))
1641 included |= USER_CHARSETS_INCLUDE_UTF8;
1643 if (!g_ascii_strcasecmp (user_charsets[i], locale))
1644 included |= USER_CHARSETS_INCLUDE_LOCALE;
1646 charsets[i] = user_charsets[i];
/* append the three built-in fallbacks that are not already covered */
1651 if (!(included & USER_CHARSETS_INCLUDE_UTF8))
1652 charsets[i++] = "UTF-8";
1654 if (!(included & USER_CHARSETS_INCLUDE_LOCALE))
1655 charsets[i++] = locale;
1657 if (!(included & USER_CHARSETS_INCLUDE_LATIN1))
1658 charsets[i++] = "iso-8859-1";
/* one output buffer is allocated up front and shared by all attempts */
1665 outleft = (len * 2) + 16;
1666 out = g_malloc (outleft + 1);
/* try each candidate in order; charsets iconv cannot open fail this
 * test */
1668 for (i = 0; charsets[i]; i++) {
1669 if ((cd = g_mime_iconv_open ("UTF-8", charsets[i])) == (iconv_t) -1)
1672 outlen = charset_convert (cd, text, len, &out, &outleft, &ninval);
1674 g_mime_iconv_close (cd);
/* early exit with the buffer trimmed to fit (per the comment further
 * down, taken when a charset converts the text flawlessly) */
1677 return g_realloc (out, outlen + 1);
1685 /* if we get here, then none of the charsets fit the 8bit text flawlessly...
1686 * try to find the one that fit the best and use that to convert what we can,
1687 * replacing any byte we can't convert with a '?' */
1689 if ((cd = g_mime_iconv_open ("UTF-8", best)) == (iconv_t) -1) {
1690 /* this shouldn't happen... but if we are here, then
1691 * it did... the only thing we can do at this point
1692 * is replace the 8bit garbage and pray */
1693 register const char *inptr = text;
1694 const char *inend = inptr + len;
1697 while (inptr < inend) {
1698 if (is_ascii (*inptr))
1708 return g_realloc (out, (size_t) (outbuf - out));
/* final conversion using the best-fitting charset */
1711 outlen = charset_convert (cd, text, len, &out, &outleft, &ninval);
1713 g_mime_iconv_close (cd);
1715 return g_realloc (out, outlen + 1);
1719 /* this decodes rfc2047's version of quoted-printable */
/* @in/@len is the encoded payload and @out receives the decoded bytes;
 * the return value is the number of bytes written. @state and @save
 * appear to carry a partially seen "=X" escape from one call to the
 * next, so an escape split across encoded-words can still be finished. */
1721 quoted_decode (const unsigned char *in, size_t len, unsigned char *out, int *state, guint32 *save)
1723 register const unsigned char *inptr;
1724 register unsigned char *outptr;
1725 const unsigned char *inend;
1726 unsigned char c, c1;
/* resume case: the saved byte is the first hex digit and the current
 * input byte is tried as the second */
1741 if (isxdigit ((int) *inptr)) {
1743 c = g_ascii_toupper ((int) (saved & 0xff));
1744 c1 = g_ascii_toupper ((int) *inptr++);
1757 /* last encoded-word ended in a malformed quoted-printable sequence */
1761 *outptr++ = (char) (saved & 0xff);
1767 while (inptr < inend) {
1771 if (inend - inptr >= 2) {
1772 if (isxdigit ((int) inptr[0]) && isxdigit ((int) inptr[1])) {
1773 c = g_ascii_toupper (*inptr++);
1774 c1 = g_ascii_toupper (*inptr++);
/* combine the two upper-cased hex digits into one byte: high nibble
 * from c, low nibble from c1 */
1776 *outptr++ = (((c >= 'A' ? c - 'A' + 10 : c - '0') & 0x0f) << 4)
1777 | ((c1 >= 'A' ? c1 - 'A' + 10 : c1 - '0') & 0x0f);
1779 /* malformed quoted-printable sequence? */
1783 /* truncated payload, maybe it was split across encoded-words? */
1784 if (inptr < inend) {
1785 if (isxdigit ((int) *inptr)) {
1790 /* malformed quoted-printable sequence? */
1799 } else if (c == '_') {
1800 /* _'s are an rfc2047 shortcut for encoding spaces */
1810 return (size_t) (outptr - out);
/* Declarations shared by the rfc2047 tokenizers and decoder.
 *
 * is_rfc2047_encoded_word(): cheap syntactic test -- the atom must be
 * at least 7 bytes long, begin with "=?" and end with "?=".  The
 * macro arguments are not parenthesized and are expanded more than
 * once, so pass simple expressions only.
 *
 * rfc2047_token: one node of the singly-linked token list.  Only the
 * `next' and `charset' members are visible in this excerpt; code
 * further down also uses `text', `length', `encoding' and `is_8bit'.
 *
 * rfc2047_token_list_free() / rfc2047_token_free(): release a whole
 * chain (linked through `next') or a single token back to the GSlice
 * allocator.
 */
1813 #define is_rfc2047_encoded_word(atom, len) (len >= 7 && !strncmp (atom, "=?", 2) && !strncmp (atom + len - 2, "?=", 2))
1815 typedef struct _rfc2047_token {
1816 struct _rfc2047_token *next;
1817 const char *charset;
1824 #define rfc2047_token_list_free(tokens) g_slice_free_chain (rfc2047_token, tokens, next)
1825 #define rfc2047_token_free(token) g_slice_free (rfc2047_token, token)
/* rfc2047_token_new:
 * @text: start of the token's text
 * @len: length of the token in bytes
 *
 * Allocates a zero-filled token with g_slice_new0() and records @len
 * as its length.
 *
 * NOTE(review): the statements that store @text and return the token
 * are not visible in this excerpt; callers use the return value as
 * the new token and later read token->text -- confirm.
 */
1827 static rfc2047_token *
1828 rfc2047_token_new (const char *text, size_t len)
1830 rfc2047_token *token;
1832 token = g_slice_new0 (rfc2047_token);
1833 token->length = len;
/* rfc2047_token_new_encoded_word:
 * @word: candidate encoded-word
 * @len: length of @word in bytes
 *
 * Parses "=?" charset ["*" language] "?" encoding "?" payload "?="
 * and, on success, creates a token whose text is the payload (a
 * pointer range inside @word, not a copy), whose charset is the
 * result of g_mime_charset_iconv_name() and whose encoding is the
 * parsed encoding character.
 *
 * Checks made by the visible code, in order:
 *   - @word is at least 7 bytes, starts with "=?" and ends with "?=";
 *   - the charset is not empty (does not start with '?' or '*');
 *   - a '?' terminates the charset and another '?' follows the
 *     one-character encoding;
 *   - the payload does not start beyond the trailing "?=".
 * The charset is copied into a g_alloca() buffer so that an rfc2231
 * "*language" suffix can be located with strchr().
 *
 * NOTE(review): the failure paths and the final return are not
 * visible here; the callers treat a NULL result as "not an
 * encoded-word" -- confirm.
 * NOTE(review): the g_alloca() size is derived from the charset
 * length found in the input -- confirm that callers bound it.
 */
1839 static rfc2047_token *
1840 rfc2047_token_new_encoded_word (const char *word, size_t len)
1842 rfc2047_token *token;
1843 const char *payload;
1844 const char *charset;
1850 /* check that this could even be an encoded-word token */
1851 if (len < 7 || strncmp (word, "=?", 2) != 0 || strncmp (word + len - 2, "?=", 2) != 0)
1854 /* skip over '=?' */
1858 if (*charset == '?' || *charset == '*') {
1859 /* this would result in an empty charset */
1863 /* skip to the end of the charset */
1864 if (!(inptr = memchr (inptr, '?', len - 2)) || inptr[2] != '?')
1867 /* copy the charset into a buffer */
1868 n = (size_t) (inptr - charset);
1869 buf = g_alloca (n + 1);
1870 memcpy (buf, charset, n);
1874 /* rfc2231 updates rfc2047 encoded words...
1875 * The ABNF given in RFC 2047 for encoded-words is:
1876 * encoded-word := "=?" charset "?" encoding "?" encoded-text "?="
1877 * This specification changes this ABNF to:
1878 * encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
1881 /* trim off the 'language' part if it's there... */
1882 if ((lang = strchr (charset, '*')))
1885 /* skip over the '?' */
1888 /* make sure the first char after the encoding is another '?' */
1889 if (inptr[1] != '?')
1903 /* the payload begins right after the '?' */
1904 payload = inptr + 1;
1906 /* find the end of the payload */
1907 inptr = word + len - 2;
1909 /* make sure that we don't have something like: =?iso-8859-1?Q?= */
1910 if (payload > inptr)
1913 token = rfc2047_token_new (payload, inptr - payload);
1914 token->charset = g_mime_charset_iconv_name (charset);
1915 token->encoding = encoding;
/* tokenize_rfc2047_phrase:
 * @in: NUL-terminated 'phrase' header value
 * @len: set to the number of bytes scanned (inptr - in)
 *
 * Walks @in and builds a linked list of rfc2047_token nodes; the
 * stack variable `list' is used as the initial tail.
 *   - A run of linear whitespace becomes an lwsp token.
 *   - A run of atom characters is offered to
 *     rfc2047_token_new_encoded_word(); the visible code otherwise
 *     creates a plain token whose is_8bit flag is set when a
 *     non-ASCII byte was seen.
 *   - A run of characters that are neither lwsp nor atom becomes a
 *     plain token, flagged is_8bit the same way.
 *
 * With _g_mime_enable_rfc2047_workarounds() enabled, the scanner also
 * looks for "=?" ... "?=" inside an atom so that encoded-words that
 * were merged with neighbouring text can be separated.
 *
 * The lwsp token in front of an encoded-word is freed rather than
 * kept when `encoded' is already set, following the rfc2047 rule
 * that whitespace between encoded-words is ignored.
 *
 * Tokens are created from pointers into @in (no copy is visible), so
 * presumably @in must outlive the returned list.
 *
 * NOTE(review): in the encoding sanity check, strchr ("BbQq", c)
 * also matches c == '\0' because the terminator counts as part of
 * the searched string; in that case inptr[2] is read one byte past
 * the end of @in.
 */
1920 static rfc2047_token *
1921 tokenize_rfc2047_phrase (const char *in, size_t *len)
1923 gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds ();
1924 rfc2047_token list, *lwsp, *token, *tail;
1925 register const char *inptr = in;
1926 gboolean encoded = FALSE;
1927 const char *text, *word;
1931 tail = (rfc2047_token *) &list;
1935 while (*inptr != '\0') {
1937 while (is_lwsp (*inptr))
1941 lwsp = rfc2047_token_new (text, inptr - text);
1947 if (is_atom (*inptr)) {
1948 if (G_UNLIKELY (enable_rfc2047_workarounds)) {
1949 /* Make an extra effort to detect and
1950 * separate encoded-word tokens that
1951 * have been merged with other
1954 if (!strncmp (inptr, "=?", 2)) {
1957 /* skip past the charset (if one is even declared, sigh) */
1958 while (*inptr && *inptr != '?') {
1959 ascii = ascii && is_ascii (*inptr);
1963 /* sanity check encoding type */
1964 if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
1969 /* find the end of the rfc2047 encoded word token */
1970 while (*inptr && strncmp (inptr, "?=", 2) != 0) {
1971 ascii = ascii && is_ascii (*inptr);
1975 if (*inptr == '\0') {
1976 /* didn't find an end marker... */
1986 /* stop if we encounter a possible rfc2047 encoded
1987 * token even if it's inside another word, sigh. */
1988 while (is_atom (*inptr) && strncmp (inptr, "=?", 2) != 0)
1992 while (is_atom (*inptr))
1996 n = (size_t) (inptr - word);
1997 if ((token = rfc2047_token_new_encoded_word (word, n))) {
1998 /* rfc2047 states that you must ignore all
1999 * whitespace between encoded words */
2000 if (!encoded && lwsp != NULL) {
2003 } else if (lwsp != NULL) {
2004 rfc2047_token_free (lwsp);
2012 /* append the lwsp and atom tokens */
2018 token = rfc2047_token_new (word, n);
2019 token->is_8bit = ascii ? 0 : 1;
2026 /* append the lwsp token */
2033 while (*inptr && !is_lwsp (*inptr) && !is_atom (*inptr)) {
2034 ascii = ascii && is_ascii (*inptr);
2038 n = (size_t) (inptr - word);
2039 token = rfc2047_token_new (word, n);
2040 token->is_8bit = ascii ? 0 : 1;
2049 *len = (size_t) (inptr - in);
/* tokenize_rfc2047_text:
 * @in: NUL-terminated unstructured 'text' header value
 * @len: set to the number of bytes scanned (inptr - in)
 *
 * Counterpart of tokenize_rfc2047_phrase() for unstructured text:
 * words are delimited by linear whitespace only (there is no is_atom
 * test), and every word records in is_8bit whether a non-ASCII byte
 * was seen.  Each word is offered to
 * rfc2047_token_new_encoded_word(); the visible code otherwise
 * creates a plain token for it.
 *
 * With _g_mime_enable_rfc2047_workarounds() enabled, a word is also
 * cut where "=?" begins and an encoded-word is followed up to its
 * "?=" so that encoded-words merged with other text are split out.
 *
 * The lwsp token in front of an encoded-word is freed rather than
 * kept when `encoded' is already set (whitespace between
 * encoded-words is ignored per rfc2047).
 *
 * NOTE(review): in the encoding sanity check, strchr ("BbQq", c)
 * also matches c == '\0' because the terminator counts as part of
 * the searched string; in that case inptr[2] is read one byte past
 * the end of @in.
 */
2054 static rfc2047_token *
2055 tokenize_rfc2047_text (const char *in, size_t *len)
2057 gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds ();
2058 rfc2047_token list, *lwsp, *token, *tail;
2059 register const char *inptr = in;
2060 gboolean encoded = FALSE;
2061 const char *text, *word;
2065 tail = (rfc2047_token *) &list;
2069 while (*inptr != '\0') {
2071 while (is_lwsp (*inptr))
2075 lwsp = rfc2047_token_new (text, inptr - text);
2079 if (*inptr != '\0') {
2083 if (G_UNLIKELY (enable_rfc2047_workarounds)) {
2084 if (!strncmp (inptr, "=?", 2)) {
2087 /* skip past the charset (if one is even declared, sigh) */
2088 while (*inptr && *inptr != '?') {
2089 ascii = ascii && is_ascii (*inptr);
2093 /* sanity check encoding type */
2094 if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
2099 /* find the end of the rfc2047 encoded word token */
2100 while (*inptr && strncmp (inptr, "?=", 2) != 0) {
2101 ascii = ascii && is_ascii (*inptr);
2105 if (*inptr == '\0') {
2106 /* didn't find an end marker... */
2116 /* stop if we encounter a possible rfc2047 encoded
2117 * token even if it's inside another word, sigh. */
2118 while (*inptr && !is_lwsp (*inptr) &&
2119 strncmp (inptr, "=?", 2) != 0) {
2120 ascii = ascii && is_ascii (*inptr);
2125 while (*inptr && !is_lwsp (*inptr)) {
2126 ascii = ascii && is_ascii (*inptr);
2131 n = (size_t) (inptr - word);
2132 if ((token = rfc2047_token_new_encoded_word (word, n))) {
2133 /* rfc2047 states that you must ignore all
2134 * whitespace between encoded words */
2135 if (!encoded && lwsp != NULL) {
2138 } else if (lwsp != NULL) {
2139 rfc2047_token_free (lwsp);
2147 /* append the lwsp and atom tokens */
2153 token = rfc2047_token_new (word, n);
2154 token->is_8bit = ascii ? 0 : 1;
2162 /* appending trailing lwsp */
2171 *len = (size_t) (inptr - in);
/* rfc2047_token_decode:
 * @token: encoded-word token whose payload is decoded
 * @outbuf: destination for the decoded bytes
 * @state: decoder state, passed through to the decoder
 * @save: decoder carry-over, passed through to the decoder
 *
 * Decodes token->text (token->length bytes): an encoding of 'B' goes
 * to g_mime_encoding_base64_decode_step(), everything else to
 * quoted_decode().
 *
 * Returns: whatever the selected decoder returns.
 *
 * NOTE(review): the comparison is against an upper-case 'B' only;
 * presumably the encoding is normalized when the token is created
 * -- confirm.
 */
2177 rfc2047_token_decode (rfc2047_token *token, unsigned char *outbuf, int *state, guint32 *save)
2179 const unsigned char *inbuf = (const unsigned char *) token->text;
2180 size_t len = token->length;
2182 if (token->encoding == 'B')
2183 return g_mime_encoding_base64_decode_step (inbuf, len, outbuf, state, save);
2185 return quoted_decode (inbuf, len, outbuf, state, save);
/* rfc2047_decode_tokens:
 * @tokens: token list built by one of the tokenizers
 * @buflen: size hint; the output GString is pre-sized to @buflen + 1
 *
 * Appends the decoded form of every token to a GString and returns
 * its character data via g_string_free (decoded, FALSE), so the
 * caller owns the returned buffer.
 *
 * Encoded-word tokens: consecutive tokens with the same encoding and
 * the same charset are treated as one run.  The run is base64 / Q
 * decoded into a scratch GByteArray (grown to the summed token
 * lengths) without resetting state/save between tokens, and only the
 * combined bytes are converted to UTF-8:
 *   - charset "UTF-8" (compared case-insensitively): checked with
 *     g_utf8_validate() in a loop, then appended directly;
 *   - the iconv descriptor cannot be opened: falls back to
 *     g_mime_utils_decode_8bit();
 *   - otherwise: converted with charset_convert().
 * Plain tokens flagged is_8bit go through g_mime_utils_decode_8bit();
 * all remaining tokens are appended unchanged.
 *
 * NOTE(review): the g_warning() near the end passes `outlen', a
 * size_t, as the precision argument of "%.*s"; printf-style
 * formatting expects an int there -- worth an explicit cast.
 */
2189 rfc2047_decode_tokens (rfc2047_token *tokens, size_t buflen)
2191 rfc2047_token *token, *next;
2192 size_t outlen, ninval, len;
2193 unsigned char *outptr;
2194 const char *charset;
2203 decoded = g_string_sized_new (buflen + 1);
2204 outbuf = g_byte_array_sized_new (76);
2207 while (token != NULL) {
2210 if (token->encoding) {
2211 /* In order to work around broken mailers, we need to combine
2212 * the raw decoded content of runs of identically encoded word
2213 * tokens before converting into UTF-8. */
2214 encoding = token->encoding;
2215 charset = token->charset;
2216 len = token->length;
2220 /* find the end of the run (and measure the buffer length we'll need) */
2221 while (next && next->encoding == encoding && !strcmp (next->charset, charset)) {
2222 len += next->length;
2226 /* make sure our temporary output buffer is large enough... */
2227 if (len > outbuf->len)
2228 g_byte_array_set_size (outbuf, len);
2230 /* base64 / quoted-printable decode each of the tokens... */
2231 outptr = outbuf->data;
2234 /* Note: by not resetting state/save each loop, we effectively
2235 * treat the payloads as one continuous block, thus allowing
2236 * us to handle cases where a hex-encoded triplet of a
2237 * quoted-printable encoded payload is split between 2 or more
2238 * encoded-word tokens. */
2239 len = rfc2047_token_decode (token, outptr, &state, &save);
2240 token = token->next;
2243 } while (token != next);
2244 outptr = outbuf->data;
2246 /* convert the raw decoded text into UTF-8 */
2247 if (!g_ascii_strcasecmp (charset, "UTF-8")) {
2248 /* slight optimization over going thru iconv */
2249 str = (char *) outptr;
2252 while (!g_utf8_validate (str, len, (const char **) &str)) {
2253 len = outlen - (str - (char *) outptr);
2257 g_string_append_len (decoded, (char *) outptr, outlen);
2258 } else if ((cd = g_mime_iconv_open ("UTF-8", charset)) == (iconv_t) -1) {
2259 w(g_warning ("Cannot convert from %s to UTF-8, header display may "
2260 "be corrupt: %s", charset[0] ? charset : "unspecified charset",
2261 g_strerror (errno)));
2263 str = g_mime_utils_decode_8bit ((char *) outptr, outlen);
2264 g_string_append (decoded, str);
2267 str = g_malloc (outlen + 1);
2270 len = charset_convert (cd, (char *) outptr, outlen, &str, &len, &ninval);
2271 g_mime_iconv_close (cd);
2273 g_string_append_len (decoded, str, len);
2278 g_warning ("Failed to completely convert \"%.*s\" to UTF-8, display may be "
2279 "corrupt: %s", outlen, (char *) outptr, g_strerror (errno));
2283 } else if (token->is_8bit) {
2284 /* *sigh* I hate broken mailers... */
2285 str = g_mime_utils_decode_8bit (token->text, token->length);
2286 g_string_append (decoded, str);
2289 g_string_append_len (decoded, token->text, token->length);
2295 g_byte_array_free (outbuf, TRUE);
2297 return g_string_free (decoded, FALSE);
2302 * g_mime_utils_header_decode_text:
2303 * @text: header text to decode
2305 * Decodes an rfc2047 encoded 'text' header.
2307 * Note: See g_mime_set_user_charsets() for details on how charset
2308 * conversion is handled for unencoded 8bit text and/or wrongly
2309 * specified rfc2047 encoded-word tokens.
2311 * Returns: a newly allocated UTF-8 string representing the decoded
/* Implementation: tokenize @text with tokenize_rfc2047_text(), decode
 * the token list with rfc2047_decode_tokens() (the scanned length is
 * passed along as the output size hint), then free the token list.
 * One visible path returns a newly allocated empty string instead;
 * its guarding condition is not visible in this excerpt. */
2315 g_mime_utils_header_decode_text (const char *text)
2317 rfc2047_token *tokens;
2322 return g_strdup ("");
2324 tokens = tokenize_rfc2047_text (text, &len);
2325 decoded = rfc2047_decode_tokens (tokens, len);
2326 rfc2047_token_list_free (tokens);
2333 * g_mime_utils_header_decode_phrase:
2334 * @phrase: header to decode
2336 * Decodes an rfc2047 encoded 'phrase' header.
2338 * Note: See g_mime_set_user_charsets() for details on how charset
2339 * conversion is handled for unencoded 8bit text and/or wrongly
2340 * specified rfc2047 encoded-word tokens.
2342 * Returns: a newly allocated UTF-8 string representing the decoded
/* Implementation: tokenize @phrase with tokenize_rfc2047_phrase(),
 * decode the token list with rfc2047_decode_tokens() (the scanned
 * length is passed along as the output size hint), then free the
 * token list.  One visible path returns a newly allocated empty
 * string instead; its guarding condition is not visible in this
 * excerpt. */
2346 g_mime_utils_header_decode_phrase (const char *phrase)
2348 rfc2047_token *tokens;
2353 return g_strdup ("");
2355 tokens = tokenize_rfc2047_phrase (phrase, &len);
2356 decoded = rfc2047_decode_tokens (tokens, len);
2357 rfc2047_token_list_free (tokens);
/* quoted_encode:
 * @in: bytes to encode
 * @len: number of bytes at @in
 * @out: destination buffer; the caller must size it for the worst
 *       case (the caller in this file uses GMIME_QP_ENCODE_LEN)
 * @safemask: gmime_special_table class mask selecting the bytes that
 *            do not need escaping
 *
 * Produces the rfc2047 flavour of quoted-printable.  A byte whose
 * gmime_special_table entry matches @safemask, and which is not '_',
 * takes the "safe" branch; the hex-escape branch writes the two
 * tohex[] digits of the byte.  No NUL terminator is written here
 * (the caller adds one).
 *
 * Returns: the number of bytes written to @out (outptr - out).
 *
 * NOTE(review): the first branch of the if-chain and the statements
 * that copy a safe byte / emit '=' are not visible in this excerpt.
 */
2363 /* rfc2047 version of quoted-printable */
2365 quoted_encode (const char *in, size_t len, unsigned char *out, gushort safemask)
2367 register const unsigned char *inptr = (const unsigned char *) in;
2368 const unsigned char *inend = inptr + len;
2369 register unsigned char *outptr = out;
2372 while (inptr < inend) {
2376 } else if (c != '_' && gmime_special_table[c] & safemask) {
2380 *outptr++ = tohex[(c >> 4) & 0xf];
2381 *outptr++ = tohex[c & 0xf];
2385 return (outptr - out);
/* rfc2047_encode_word:
 * @string: GString the encoded-word is appended to
 * @word: text to encode
 * @len: length of @word in bytes
 * @charset: charset name written into the encoded-word
 * @safemask: class mask handed on to quoted_encode()
 *
 * Appends "=?charset?X?payload?=" to @string.  Unless @charset is
 * "UTF-8" (compared case-insensitively) an iconv descriptor from
 * UTF-8 to @charset is opened and, when that succeeds, the word is
 * converted with g_mime_iconv_strndup() before being encoded.
 * g_mime_utils_best_encoding() then selects base64 or the rfc2047
 * flavour of quoted-printable; any other result reaches
 * g_assert_not_reached().  For base64, any '\n' the encoder emitted
 * is removed because headers are wrapped separately.
 *
 * NOTE(review): the scratch buffer comes from g_alloca() and is
 * sized from @len, so stack use grows with the word length.
 * NOTE(review): handling of a failed conversion and the release of
 * the converted copy are not visible in this excerpt.
 */
2389 rfc2047_encode_word (GString *string, const char *word, size_t len,
2390 const char *charset, gushort safemask)
2392 register char *inptr, *outptr;
2393 iconv_t cd = (iconv_t) -1;
2394 unsigned char *encoded;
2401 if (g_ascii_strcasecmp (charset, "UTF-8") != 0)
2402 cd = g_mime_iconv_open (charset, "UTF-8");
2404 if (cd != (iconv_t) -1) {
2405 uword = g_mime_iconv_strndup (cd, (char *) word, len);
2406 g_mime_iconv_close (cd);
2410 len = strlen (uword);
2416 switch (g_mime_utils_best_encoding ((const unsigned char *) word, len)) {
2417 case GMIME_CONTENT_ENCODING_BASE64:
2418 enclen = GMIME_BASE64_ENCODE_LEN (len);
2419 encoded = g_alloca (enclen + 1);
2423 pos = g_mime_encoding_base64_encode_close ((const unsigned char *) word, len, encoded, &state, &save);
2424 encoded[pos] = '\0';
2426 /* remove \n chars as headers need to be wrapped differently */
2427 if (G_UNLIKELY ((inptr = strchr ((char *) encoded, '\n')))) {
2429 while (G_LIKELY (*inptr)) {
2430 if (G_LIKELY (*inptr != '\n'))
2440 case GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE:
2441 enclen = GMIME_QP_ENCODE_LEN (len);
2442 encoded = g_alloca (enclen + 1);
2446 pos = quoted_encode (word, len, encoded, safemask);
2447 encoded[pos] = '\0';
2453 g_assert_not_reached ();
2458 g_string_append_printf (string, "=?%s?%c?%s?=", charset, encoding, encoded);
/* rfc822_word: one node of the singly-linked word list used by the
 * rfc2047 encoder.  `start' and `end' delimit the word inside the
 * caller's input string (pointers, not copies).  Code further down
 * also uses `type' and `encoding' members, which are not visible in
 * this excerpt.
 *
 * rfc822_word_new() uses g_slice_new(), which does not zero the
 * memory, so every member must be assigned by the code that creates
 * a word; rfc822_word_free() returns a node to the GSlice allocator.
 */
2468 typedef struct _rfc822_word {
2469 struct _rfc822_word *next;
2470 const char *start, *end;
2475 #define rfc822_word_free(word) g_slice_free (rfc822_word, word)
2476 #define rfc822_word_new() g_slice_new (rfc822_word)
/* rfc2047_encode_get_rfc822_words:
 * @in: NUL-terminated text, walked one UTF-8 character at a time
 *      (g_utf8_next_char / g_utf8_get_char)
 * @phrase: when true, a character that is not an atom character
 *          promotes the current word to at least WORD_QSTRING
 *
 * Splits @in into a linked list of rfc822_word records; the stack
 * variable `words' is used as the initial tail.  A word is emitted
 * when a blank is reached, when `count' reaches
 * GMIME_FOLD_PREENCODED (over-long words are broken, keeping `type'),
 * and once more after the loop.  Each record stores the word's start
 * pointer and the `encoding' level reached so far; the visible
 * branches raise that level to at least 1.
 *
 * An invalid UTF-8 sequence triggers a warning (when warnings are
 * compiled in); what follows is not visible in this excerpt.
 *
 * NOTE(review): the debug printf() passes a pointer difference
 * (word->end - word->start) as the precision argument of "%.*s";
 * printf expects an int there -- worth an explicit cast.
 */
2478 /* okay, so 'unstructured text' fields don't actually contain 'word'
2479 * tokens, but we can group stuff similarly... */
2480 static rfc822_word *
2481 rfc2047_encode_get_rfc822_words (const char *in, gboolean phrase)
2483 rfc822_word words, *tail, *word;
2484 rfc822_word_t type = WORD_ATOM;
2485 const char *inptr, *start, *last;
2486 int count = 0, encoding = 0;
2488 tail = (rfc822_word *) &words;
2491 last = start = inptr = in;
2492 while (inptr && *inptr) {
2493 const char *newinptr;
2496 newinptr = g_utf8_next_char (inptr);
2497 c = g_utf8_get_char (inptr);
2498 if (newinptr == NULL || !g_unichar_validate (c)) {
2499 w(g_warning ("Invalid UTF-8 sequence encountered"));
2506 if (c < 256 && is_blank (c)) {
2508 word = rfc822_word_new ();
2510 word->start = start;
2513 word->encoding = encoding;
2528 encoding = MAX (encoding, 1);
2529 } else if (phrase && !is_atom (c)) {
2530 /* phrases can have qstring words */
2531 type = MAX (type, WORD_QSTRING);
2533 } else if (c < 256) {
2535 encoding = MAX (encoding, 1);
2541 if (count >= GMIME_FOLD_PREENCODED) {
2542 if (type == WORD_ATOM)
2545 word = rfc822_word_new ();
2547 word->start = start;
2550 word->encoding = encoding;
2556 /* Note: don't reset 'type' as it
2557 * needs to be preserved when breaking
2568 word = rfc822_word_new ();
2570 word->start = start;
2573 word->encoding = encoding;
2580 printf ("rfc822 word tokens:\n");
2583 printf ("\t'%.*s'; type=%d, encoding=%d\n",
2584 word->end - word->start, word->start,
2585 word->type, word->encoding);
/* True when a merged word of length @wlen still fits the folding
 * budget for its @type: WORD_2047 words must stay below
 * GMIME_FOLD_PREENCODED, every other type below GMIME_FOLD_LEN - 8. */
2594 #define MERGED_WORD_LT_FOLDLEN(wlen, type) ((type) == WORD_2047 ? (wlen) < GMIME_FOLD_PREENCODED : (wlen) < (GMIME_FOLD_LEN - 8))
/* should_merge_words:
 * @word: current word
 * @next: the word that follows @word in the list
 *
 * Decides whether @word and @next should be merged into one word,
 * switching on word->type (the case labels are not visible in this
 * excerpt).  Every visible branch that allows a merge finally
 * requires the combined span (next->end - word->start) to pass
 * MERGED_WORD_LT_FOLDLEN.  Rules readable from the visible code:
 *   - one branch tests next->type == WORD_2047 and otherwise sizes
 *     the merge by next->type;
 *   - one branch avoids merging with words that need rfc2047
 *     encoding and sizes the merge as a WORD_QSTRING;
 *   - one branch, when followed by atoms, counts the run of atoms
 *     and does not merge if the list ends with them or there are
 *     more than 3; it also avoids merging with qstrings and sizes
 *     the merge as WORD_2047.
 */
2597 should_merge_words (rfc822_word *word, rfc822_word *next)
2599 switch (word->type) {
2601 if (next->type == WORD_2047)
2604 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, next->type));
2606 /* avoid merging with words that need to be rfc2047 encoded */
2607 if (next->type == WORD_2047)
2610 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, WORD_QSTRING));
2612 if (next->type == WORD_ATOM) {
2613 /* whether we merge or not is dependent upon:
2614 * 1. the number of atoms in a row after 'word'
2615 * 2. if there is another encword after the string of atoms.
2619 while (next && next->type == WORD_ATOM) {
2624 /* if all the words after the encword are atoms, don't merge */
2625 if (!next || natoms > 3)
2629 /* avoid merging with qstrings */
2630 if (next->type == WORD_QSTRING)
2633 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, WORD_2047));
/* rfc2047_encode_merge_rfc822_words:
 * @wordsp: address of the head of the word list
 *
 * Merges adjacent words in place, in two passes over the list:
 *   1. neighbours of the same non-atom type are merged while the
 *      combined span passes MERGED_WORD_LT_FOLDLEN;
 *   2. remaining neighbours are merged whenever
 *      should_merge_words() agrees; the merged word takes the MAX of
 *      the two types.
 * A merge extends word->end to next->end, keeps the higher encoding
 * level, unlinks `next' and frees it.
 */
2640 rfc2047_encode_merge_rfc822_words (rfc822_word **wordsp)
2642 rfc822_word *word, *next, *words = *wordsp;
2644 /* first pass: merge qstrings with adjacent qstrings and encwords with adjacent encwords */
2646 while (word && word->next) {
2649 if (word->type != WORD_ATOM && word->type == next->type &&
2650 MERGED_WORD_LT_FOLDLEN (next->end - word->start, word->type)) {
2651 /* merge the words */
2652 word->encoding = MAX (word->encoding, next->encoding);
2654 word->end = next->end;
2655 word->next = next->next;
2657 rfc822_word_free (next);
2665 /* second pass: now merge atoms with the other words */
2667 while (word && word->next) {
2670 if (should_merge_words (word, next)) {
2671 /* the resulting word type is the MAX of the 2 types */
2672 word->type = MAX (word->type, next->type);
2674 word->encoding = MAX (word->encoding, next->encoding);
2676 word->end = next->end;
2677 word->next = next->next;
2679 rfc822_word_free (next);
/* g_string_append_len_quoted:
 * @out: GString to append to
 * @in: text to quote
 * @len: number of bytes of @in to use
 *
 * Appends @in to @out as a quoted-string: an opening '"', each byte
 * of the input with a backslash inserted before every '"' and '\',
 * then a closing '"'.
 */
2691 g_string_append_len_quoted (GString *out, const char *in, size_t len)
2693 register const char *inptr;
2696 g_string_append_c (out, '"');
2701 while (inptr < inend) {
2702 if (*inptr == '"' || *inptr == '\\')
2703 g_string_append_c (out, '\\');
2705 g_string_append_c (out, *inptr);
2710 g_string_append_c (out, '"');
/* rfc2047_encode:
 * @in: text to encode
 * @safemask: IS_PSAFE for 'phrase' headers, IS_ESAFE for 'text'
 *            headers (the two values passed by the public wrappers)
 *
 * Splits @in into words, merges neighbouring words, then writes each
 * word to a GString according to its type:
 *   - one branch copies the word verbatim;
 *   - one branch writes it as a quoted-string (asserted to be
 *     reachable only when IS_PSAFE is set);
 *   - one branch writes it as an rfc2047 encoded-word.
 * The original whitespace between two words is copied through unless
 * both are WORD_2047; in that case it is folded into the second
 * encoded-word's payload and a single ' ' separates the two
 * encoded-words.
 *
 * Charset selection for encoded-words follows word->encoding:
 * 0 uses "us-ascii", 1 uses "iso-8859-1"; otherwise the first user
 * charset (g_mime_user_charsets()) that can encode the word is
 * taken, with g_mime_charset_best_name() also visible as a source.
 *
 * When no words are found a copy of @in is returned.  Each word
 * record is freed once it has been written.
 */
2714 rfc2047_encode (const char *in, gushort safemask)
2716 rfc822_word *words, *word, *prev = NULL;
2717 const char **charsets, *charset;
2725 if (!(words = rfc2047_encode_get_rfc822_words (in, safemask & IS_PSAFE)))
2726 return g_strdup (in);
2728 rfc2047_encode_merge_rfc822_words (&words);
2730 charsets = g_mime_user_charsets ();
2732 out = g_string_new ("");
2734 /* output words now with spaces between them */
2737 /* append correct number of spaces between words */
2738 if (prev && !(prev->type == WORD_2047 && word->type == WORD_2047)) {
2739 /* one or both of the words are not encoded so we write the spaces out untouched */
2740 len = word->start - prev->end;
2741 g_string_append_len (out, prev->end, len);
2744 switch (word->type) {
2746 g_string_append_len (out, word->start, (size_t) (word->end - word->start));
2749 g_assert (safemask & IS_PSAFE);
2750 g_string_append_len_quoted (out, word->start, (size_t) (word->end - word->start));
2753 if (prev && prev->type == WORD_2047) {
2754 /* include the whitespace chars between these 2 words in the
2755 resulting rfc2047 encoded word. */
2756 len = word->end - prev->end;
2759 /* encoded words need to be separated by linear whitespace */
2760 g_string_append_c (out, ' ');
2762 len = word->end - word->start;
2763 start = word->start;
2766 switch (word->encoding) {
2767 case 0: /* us-ascii */
2768 rfc2047_encode_word (out, start, len, "us-ascii", safemask);
2770 case 1: /* iso-8859-1 */
2771 rfc2047_encode_word (out, start, len, "iso-8859-1", safemask);
2775 g_mime_charset_init (&mask);
2776 g_mime_charset_step (&mask, start, len);
2778 for (i = 0; charsets && charsets[i]; i++) {
2779 if (g_mime_charset_can_encode (&mask, charsets[i], start, len)) {
2780 charset = charsets[i];
2786 charset = g_mime_charset_best_name (&mask);
2788 rfc2047_encode_word (out, start, len, charset, safemask);
2795 rfc822_word_free (prev);
2801 rfc822_word_free (prev);
2804 g_string_free (out, FALSE);
2811 * g_mime_utils_header_encode_phrase:
2812 * @phrase: phrase to encode
2814 * Encodes a 'phrase' header according to the rules in rfc2047.
2816 * Returns: the encoded 'phrase'. Useful for encoding internet
/* Implementation: thin wrapper that runs rfc2047_encode() with the
 * IS_PSAFE mask, i.e. with phrase rules (quoted-strings allowed). */
2820 g_mime_utils_header_encode_phrase (const char *phrase)
2825 return rfc2047_encode (phrase, IS_PSAFE);
2830 * g_mime_utils_header_encode_text:
2831 * @text: text to encode
2833 * Encodes a 'text' header according to the rules in rfc2047.
2835 * Returns: the encoded header. Useful for encoding
2836 * headers like "Subject".
2839 g_mime_utils_header_encode_text (const char *text)
2844 return rfc2047_encode (text, IS_ESAFE);