gmime/gmime-utils.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*  GMime
   3  *  Copyright (C) 2000-2012 Jeffrey Stedfast
   4  *
   5  *  This library is free software; you can redistribute it and/or
   6  *  modify it under the terms of the GNU Lesser General Public License
   7  *  as published by the Free Software Foundation; either version 2.1
   8  *  of the License, or (at your option) any later version.
   9  *
  10  *  This library is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  *  Lesser General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU Lesser General Public
  16  *  License along with this library; if not, write to the Free
  17  *  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
  18  *  02110-1301, USA.
  19  */
  20
  21
  22 #ifdef HAVE_CONFIG_H
  23 #include <config.h>
  24 #endif
  25
  26 #define _GNU_SOURCE
  27
#include <glib.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>         /* for INT_MAX */
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>      /* for MAXHOSTNAMELEN */
#else
#define MAXHOSTNAMELEN 64
#endif
#ifdef HAVE_UTSNAME_DOMAINNAME
#include <sys/utsname.h>    /* for uname() */
#endif
#include <sys/types.h>
#include <unistd.h>         /* Unix header for getpid() */
#ifdef G_OS_WIN32
#include <winsock2.h>
#include <ws2tcpip.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#include <ctype.h>
#include <errno.h>
  52
  53 #include "gmime-utils.h"
  54 #include "gmime-table-private.h"
  55 #include "gmime-parse-utils.h"
  56 #include "gmime-part.h"
  57 #include "gmime-charset.h"
  58 #include "gmime-iconv.h"
  59 #include "gmime-iconv-utils.h"
  60
  61 #ifdef ENABLE_WARNINGS
  62 #define w(x) x
  63 #else
  64 #define w(x)
  65 #endif /* ENABLE_WARNINGS */
  66
  67 #define d(x)
  68
  69
  70 /**
  71  * SECTION: gmime-utils
  72  * @title: gmime-utils
  73  * @short_description: MIME utility functions
  74  * @see_also:
  75  *
  76  * Utility functions to parse, encode and decode various MIME tokens
  77  * and encodings.
  78  **/
  79
  80 extern gboolean _g_mime_enable_rfc2047_workarounds (void);
  81
  82 #define GMIME_FOLD_PREENCODED  (GMIME_FOLD_LEN / 2)
  83
  84 /* date parser macros */
  85 #define NUMERIC_CHARS          "1234567890"
  86 #define WEEKDAY_CHARS          "SundayMondayTuesdayWednesdayThursdayFridaySaturday"
  87 #define MONTH_CHARS            "JanuaryFebruaryMarchAprilMayJuneJulyAugustSeptemberOctoberNovemberDecember"
  88 #define TIMEZONE_ALPHA_CHARS   "UTCGMTESTEDTCSTCDTMSTPSTPDTZAMNY()"
  89 #define TIMEZONE_NUMERIC_CHARS "-+1234567890"
  90 #define TIME_CHARS             "1234567890:"
  91
  92 #define DATE_TOKEN_NON_NUMERIC          (1 << 0)
  93 #define DATE_TOKEN_NON_WEEKDAY          (1 << 1)
  94 #define DATE_TOKEN_NON_MONTH            (1 << 2)
  95 #define DATE_TOKEN_NON_TIME             (1 << 3)
  96 #define DATE_TOKEN_HAS_COLON            (1 << 4)
  97 #define DATE_TOKEN_NON_TIMEZONE_ALPHA   (1 << 5)
  98 #define DATE_TOKEN_NON_TIMEZONE_NUMERIC (1 << 6)
  99 #define DATE_TOKEN_HAS_SIGN             (1 << 7)
 100
 101 static unsigned char tohex[16] = {
 102         '0', '1', '2', '3', '4', '5', '6', '7',
 103         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
 104 };
 105
 106 static unsigned char gmime_datetok_table[256] = {
 107         128,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
 108         111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
 109         111,111,111,111,111,111,111,111, 79, 79,111,175,111,175,111,111,
 110          38, 38, 38, 38, 38, 38, 38, 38, 38, 38,119,111,111,111,111,111,
 111         111, 75,111, 79, 75, 79,105, 79,111,111,107,111,111, 73, 75,107,
 112          79,111,111, 73, 77, 79,111,109,111, 79, 79,111,111,111,111,111,
 113         111,105,107,107,109,105,111,107,105,105,111,111,107,107,105,105,
 114         107,111,105,105,105,105,107,111,111,105,111,111,111,111,111,111,
 115         111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
 116         111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
 117         111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
 118         111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
 119         111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
 120         111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
 121         111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
 122         111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
 123 };
 124
 125 /* Timezone values defined in rfc5322 */
 126 static struct {
 127         const char *name;
 128         int offset;
 129 } tz_offsets [] = {
 130         { "UT",       0 },
 131         { "GMT",      0 },
 132         { "EDT",   -400 },
 133         { "EST",   -500 },
 134         { "CDT",   -500 },
 135         { "CST",   -600 },
 136         { "MDT",   -600 },
 137         { "MST",   -700 },
 138         { "PDT",   -700 },
 139         { "PST",   -800 },
 140         /* Note: rfc822 got the signs backwards for the military
 141          * timezones so some sending clients may mistakenly use the
 142          * wrong values. */
 143         { "A",      100 },
 144         { "B",      200 },
 145         { "C",      300 },
 146         { "D",      400 },
 147         { "E",      500 },
 148         { "F",      600 },
 149         { "G",      700 },
 150         { "H",      800 },
 151         { "I",      900 },
 152         { "K",     1000 },
 153         { "L",     1100 },
 154         { "M",     1200 },
 155         { "N",     -100 },
 156         { "O",     -200 },
 157         { "P",     -300 },
 158         { "Q",     -400 },
 159         { "R",     -500 },
 160         { "S",     -600 },
 161         { "T",     -700 },
 162         { "U",     -800 },
 163         { "V",     -900 },
 164         { "W",    -1000 },
 165         { "X",    -1100 },
 166         { "Y",    -1200 },
 167         { "Z",        0 },
 168 };
 169
 170 static char *tm_months[] = {
 171         "Jan", "Feb", "Mar", "Apr", "May", "Jun",
 172         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
 173 };
 174
 175 static char *tm_days[] = {
 176         "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
 177 };
 178
 179
 180 /**
 181  * g_mime_utils_header_format_date:
 182  * @date: time_t date representation
 183  * @tz_offset: Timezone offset
 184  *
 185  * Allocates a string buffer containing the rfc822 formatted date
 186  * string represented by @time and @tz_offset.
 187  *
 188  * Returns: a valid string representation of the date.
 189  **/
 190 char *
 191 g_mime_utils_header_format_date (time_t date, int tz_offset)
 192 {
 193         struct tm tm;
 194
 195         date += ((tz_offset / 100) * (60 * 60)) + (tz_offset % 100) * 60;
 196
 197 #if defined (HAVE_GMTIME_R)
 198         gmtime_r (&date, &tm);
 199 #elif defined (HAVE_GMTIME_S)
 200         gmtime_s (&tm, &date);
 201 #else
 202         memcpy (&tm, gmtime (&date), sizeof (tm));
 203 #endif
 204
 205         return g_strdup_printf ("%s, %02d %s %04d %02d:%02d:%02d %+05d",
 206                                 tm_days[tm.tm_wday], tm.tm_mday,
 207                                 tm_months[tm.tm_mon],
 208                                 tm.tm_year + 1900,
 209                                 tm.tm_hour, tm.tm_min, tm.tm_sec,
 210                                 tz_offset);
 211 }
 212
 213 /* This is where it gets ugly... */
 214
 215 typedef struct _date_token {
 216         struct _date_token *next;
 217         unsigned char mask;
 218         const char *start;
 219         size_t len;
 220 } date_token;
 221
 222 #define date_token_free(tok) g_slice_free (date_token, tok)
 223 #define date_token_new() g_slice_new (date_token)
 224
 225 static date_token *
 226 datetok (const char *date)
 227 {
 228         date_token tokens, *token, *tail;
 229         const char *start, *end;
 230         unsigned char mask;
 231
 232         tail = (date_token *) &tokens;
 233         tokens.next = NULL;
 234
 235         start = date;
 236         while (*start) {
 237                 /* kill leading whitespace */
 238                 while (*start == ' ' || *start == '\t')
 239                         start++;
 240
 241                 if (*start == '\0')
 242                         break;
 243
 244                 mask = gmime_datetok_table[(unsigned char) *start];
 245
 246                 /* find the end of this token */
 247                 end = start + 1;
 248                 while (*end && !strchr ("-/,\t\r\n ", *end))
 249                         mask |= gmime_datetok_table[(unsigned char) *end++];
 250
 251                 if (end != start) {
 252                         token = date_token_new ();
 253                         token->next = NULL;
 254                         token->start = start;
 255                         token->len = end - start;
 256                         token->mask = mask;
 257
 258                         tail->next = token;
 259                         tail = token;
 260                 }
 261
 262                 if (*end)
 263                         start = end + 1;
 264                 else
 265                         break;
 266         }
 267
 268         return tokens.next;
 269 }
 270
 271 static int
 272 decode_int (const char *in, size_t inlen)
 273 {
 274         register const char *inptr;
 275         int sign = 1, val = 0;
 276         const char *inend;
 277
 278         inptr = in;
 279         inend = in + inlen;
 280
 281         if (*inptr == '-') {
 282                 sign = -1;
 283                 inptr++;
 284         } else if (*inptr == '+')
 285                 inptr++;
 286
 287         for ( ; inptr < inend; inptr++) {
 288                 if (!(*inptr >= '0' && *inptr <= '9'))
 289                         return -1;
 290                 else
 291                         val = (val * 10) + (*inptr - '0');
 292         }
 293
 294         val *= sign;
 295
 296         return val;
 297 }
 298
 299 #if 0
 300 static int
 301 get_days_in_month (int month, int year)
 302 {
 303         switch (month) {
 304         case 1:
 305         case 3:
 306         case 5:
 307         case 7:
 308         case 8:
 309         case 10:
 310         case 12:
 311                 return 31;
 312         case 4:
 313         case 6:
 314         case 9:
 315         case 11:
 316                 return 30;
 317         case 2:
 318                 if (g_date_is_leap_year (year))
 319                         return 29;
 320                 else
 321                         return 28;
 322         default:
 323                 return 0;
 324         }
 325 }
 326 #endif
 327
 328 static int
 329 get_wday (const char *in, size_t inlen)
 330 {
 331         int wday;
 332
 333         g_return_val_if_fail (in != NULL, -1);
 334
 335         if (inlen < 3)
 336                 return -1;
 337
 338         for (wday = 0; wday < 7; wday++) {
 339                 if (!g_ascii_strncasecmp (in, tm_days[wday], 3))
 340                         return wday;
 341         }
 342
 343         return -1;  /* unknown week day */
 344 }
 345
 346 static int
 347 get_mday (const char *in, size_t inlen)
 348 {
 349         int mday;
 350
 351         g_return_val_if_fail (in != NULL, -1);
 352
 353         mday = decode_int (in, inlen);
 354
 355         if (mday < 0 || mday > 31)
 356                 mday = -1;
 357
 358         return mday;
 359 }
 360
 361 static int
 362 get_month (const char *in, size_t inlen)
 363 {
 364         int i;
 365
 366         g_return_val_if_fail (in != NULL, -1);
 367
 368         if (inlen < 3)
 369                 return -1;
 370
 371         for (i = 0; i < 12; i++) {
 372                 if (!g_ascii_strncasecmp (in, tm_months[i], 3))
 373                         return i;
 374         }
 375
 376         return -1;  /* unknown month */
 377 }
 378
 379 static int
 380 get_year (const char *in, size_t inlen)
 381 {
 382         int year;
 383
 384         g_return_val_if_fail (in != NULL, -1);
 385
 386         if ((year = decode_int (in, inlen)) == -1)
 387                 return -1;
 388
 389         if (year < 100)
 390                 year += (year < 70) ? 2000 : 1900;
 391
 392         if (year < 1969)
 393                 return -1;
 394
 395         return year;
 396 }
 397
 398 static gboolean
 399 get_time (const char *in, size_t inlen, int *hour, int *min, int *sec)
 400 {
 401         register const char *inptr;
 402         int *val, colons = 0;
 403         const char *inend;
 404
 405         *hour = *min = *sec = 0;
 406
 407         inend = in + inlen;
 408         val = hour;
 409         for (inptr = in; inptr < inend; inptr++) {
 410                 if (*inptr == ':') {
 411                         colons++;
 412                         switch (colons) {
 413                         case 1:
 414                                 val = min;
 415                                 break;
 416                         case 2:
 417                                 val = sec;
 418                                 break;
 419                         default:
 420                                 return FALSE;
 421                         }
 422                 } else if (!(*inptr >= '0' && *inptr <= '9'))
 423                         return FALSE;
 424                 else
 425                         *val = (*val * 10) + (*inptr - '0');
 426         }
 427
 428         return TRUE;
 429 }
 430
 431 static int
 432 get_tzone (date_token **token)
 433 {
 434         const char *inptr, *inend;
 435         size_t inlen;
 436         int i, t;
 437
 438         for (i = 0; *token && i < 2; *token = (*token)->next, i++) {
 439                 inptr = (*token)->start;
 440                 inlen = (*token)->len;
 441                 inend = inptr + inlen;
 442
 443                 if (*inptr == '+' || *inptr == '-') {
 444                         return decode_int (inptr, inlen);
 445                 } else {
 446                         if (*inptr == '(') {
 447                                 inptr++;
 448                                 if (*(inend - 1) == ')')
 449                                         inlen -= 2;
 450                                 else
 451                                         inlen--;
 452                         }
 453
 454                         for (t = 0; t < 15; t++) {
 455                                 size_t len = strlen (tz_offsets[t].name);
 456
 457                                 if (len != inlen)
 458                                         continue;
 459
 460                                 if (!strncmp (inptr, tz_offsets[t].name, len))
 461                                         return tz_offsets[t].offset;
 462                         }
 463                 }
 464         }
 465
 466         return -1;
 467 }
 468
 469 static time_t
 470 mktime_utc (struct tm *tm)
 471 {
 472         time_t tt;
 473         long tz;
 474
 475         tm->tm_isdst = -1;
 476         tt = mktime (tm);
 477
 478 #if defined (G_OS_WIN32) && !defined (__MINGW32__)
 479         _get_timezone (&tz);
 480         if (tm->tm_isdst > 0) {
 481                 int dst;
 482
 483                 _get_dstbias (&dst);
 484                 tz += dst;
 485         }
 486 #elif defined (HAVE_TM_GMTOFF)
 487         tz = -tm->tm_gmtoff;
 488 #elif defined (HAVE_TIMEZONE)
 489         if (tm->tm_isdst > 0) {
 490 #if defined (HAVE_ALTZONE)
 491                 tz = altzone;
 492 #else /* !defined (HAVE_ALTZONE) */
 493                 tz = (timezone - 3600);
 494 #endif
 495         } else {
 496                 tz = timezone;
 497         }
 498 #elif defined (HAVE__TIMEZONE)
 499         tz = _timezone;
 500 #else
 501 #error Neither HAVE_TIMEZONE nor HAVE_TM_GMTOFF defined. Rerun autoheader, autoconf, etc.
 502 #endif
 503
 504         return tt - tz;
 505 }
 506
 507 static time_t
 508 parse_rfc822_date (date_token *tokens, int *tzone)
 509 {
 510         int hour, min, sec, offset, n;
 511         date_token *token;
 512         struct tm tm;
 513         time_t t;
 514
 515         g_return_val_if_fail (tokens != NULL, (time_t) 0);
 516
 517         token = tokens;
 518
 519         memset ((void *) &tm, 0, sizeof (struct tm));
 520
 521         if ((n = get_wday (token->start, token->len)) != -1) {
 522                 /* not all dates may have this... */
 523                 tm.tm_wday = n;
 524                 token = token->next;
 525         }
 526
 527         /* get the mday */
 528         if (!token || (n = get_mday (token->start, token->len)) == -1)
 529                 return (time_t) 0;
 530
 531         tm.tm_mday = n;
 532         token = token->next;
 533
 534         /* get the month */
 535         if (!token || (n = get_month (token->start, token->len)) == -1)
 536                 return (time_t) 0;
 537
 538         tm.tm_mon = n;
 539         token = token->next;
 540
 541         /* get the year */
 542         if (!token || (n = get_year (token->start, token->len)) == -1)
 543                 return (time_t) 0;
 544
 545         tm.tm_year = n - 1900;
 546         token = token->next;
 547
 548         /* get the hour/min/sec */
 549         if (!token || !get_time (token->start, token->len, &hour, &min, &sec))
 550                 return (time_t) 0;
 551
 552         tm.tm_hour = hour;
 553         tm.tm_min = min;
 554         tm.tm_sec = sec;
 555         token = token->next;
 556
 557         /* get the timezone */
 558         if (!token || (n = get_tzone (&token)) == -1) {
 559                 /* I guess we assume tz is GMT? */
 560                 offset = 0;
 561         } else {
 562                 offset = n;
 563         }
 564
 565         t = mktime_utc (&tm);
 566
 567         /* t is now GMT of the time we want, but not offset by the timezone ... */
 568
 569         /* this should convert the time to the GMT equiv time */
 570         t -= ((offset / 100) * 60 * 60) + (offset % 100) * 60;
 571
 572         if (tzone)
 573                 *tzone = offset;
 574
 575         return t;
 576 }
 577
 578
 579 #define date_token_mask(t)  (((date_token *) t)->mask)
 580 #define is_numeric(t)       ((date_token_mask (t) & DATE_TOKEN_NON_NUMERIC) == 0)
 581 #define is_weekday(t)       ((date_token_mask (t) & DATE_TOKEN_NON_WEEKDAY) == 0)
 582 #define is_month(t)         ((date_token_mask (t) & DATE_TOKEN_NON_MONTH) == 0)
 583 #define is_time(t)          (((date_token_mask (t) & DATE_TOKEN_NON_TIME) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_COLON))
 584 #define is_tzone_alpha(t)   ((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_ALPHA) == 0)
 585 #define is_tzone_numeric(t) (((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_NUMERIC) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_SIGN))
 586 #define is_tzone(t)         (is_tzone_alpha (t) || is_tzone_numeric (t))
 587
 588 static time_t
 589 parse_broken_date (date_token *tokens, int *tzone)
 590 {
 591         gboolean got_wday, got_month, got_tzone;
 592         int hour, min, sec, offset, n;
 593         date_token *token;
 594         struct tm tm;
 595         time_t t;
 596
 597         memset ((void *) &tm, 0, sizeof (struct tm));
 598         got_wday = got_month = got_tzone = FALSE;
 599         offset = 0;
 600
 601         token = tokens;
 602         while (token) {
 603                 if (is_weekday (token) && !got_wday) {
 604                         if ((n = get_wday (token->start, token->len)) != -1) {
 605                                 d(printf ("weekday; "));
 606                                 got_wday = TRUE;
 607                                 tm.tm_wday = n;
 608                                 goto next;
 609                         }
 610                 }
 611
 612                 if (is_month (token) && !got_month) {
 613                         if ((n = get_month (token->start, token->len)) != -1) {
 614                                 d(printf ("month; "));
 615                                 got_month = TRUE;
 616                                 tm.tm_mon = n;
 617                                 goto next;
 618                         }
 619                 }
 620
 621                 if (is_time (token) && !tm.tm_hour && !tm.tm_min && !tm.tm_sec) {
 622                         if (get_time (token->start, token->len, &hour, &min, &sec)) {
 623                                 d(printf ("time; "));
 624                                 tm.tm_hour = hour;
 625                                 tm.tm_min = min;
 626                                 tm.tm_sec = sec;
 627                                 goto next;
 628                         }
 629                 }
 630
 631                 if (is_tzone (token) && !got_tzone) {
 632                         date_token *t = token;
 633
 634                         if ((n = get_tzone (&t)) != -1) {
 635                                 d(printf ("tzone; "));
 636                                 got_tzone = TRUE;
 637                                 offset = n;
 638                                 goto next;
 639                         }
 640                 }
 641
 642                 if (is_numeric (token)) {
 643                         if (token->len == 4 && !tm.tm_year) {
 644                                 if ((n = get_year (token->start, token->len)) != -1) {
 645                                         d(printf ("year; "));
 646                                         tm.tm_year = n - 1900;
 647                                         goto next;
 648                                 }
 649                         } else {
 650                                 /* Note: assumes MM-DD-YY ordering if '0 < MM < 12' holds true */
 651                                 if (!got_month && token->next && is_numeric (token->next)) {
 652                                         if ((n = decode_int (token->start, token->len)) > 12) {
 653                                                 goto mday;
 654                                         } else if (n > 0) {
 655                                                 d(printf ("mon; "));
 656                                                 got_month = TRUE;
 657                                                 tm.tm_mon = n - 1;
 658                                         }
 659                                         goto next;
 660                                 } else if (!tm.tm_mday && (n = get_mday (token->start, token->len)) != -1) {
 661                                 mday:
 662                                         d(printf ("mday; "));
 663                                         tm.tm_mday = n;
 664                                         goto next;
 665                                 } else if (!tm.tm_year) {
 666                                         if ((n = get_year (token->start, token->len)) != -1) {
 667                                                 d(printf ("2-digit year; "));
 668                                                 tm.tm_year = n - 1900;
 669                                         }
 670                                         goto next;
 671                                 }
 672                         }
 673                 }
 674
 675                 d(printf ("???; "));
 676
 677         next:
 678
 679                 token = token->next;
 680         }
 681
 682         d(printf ("\n"));
 683
 684         t = mktime_utc (&tm);
 685
 686         /* t is now GMT of the time we want, but not offset by the timezone ... */
 687
 688         /* this should convert the time to the GMT equiv time */
 689         t -= ((offset / 100) * 60 * 60) + (offset % 100) * 60;
 690
 691         if (tzone)
 692                 *tzone = offset;
 693
 694         return t;
 695 }
 696
 697 #if 0
 698 static void
 699 gmime_datetok_table_init (void)
 700 {
 701         int i;
 702
 703         memset (gmime_datetok_table, 0, sizeof (gmime_datetok_table));
 704
 705         for (i = 0; i < 256; i++) {
 706                 if (!strchr (NUMERIC_CHARS, i))
 707                         gmime_datetok_table[i] |= DATE_TOKEN_NON_NUMERIC;
 708
 709                 if (!strchr (WEEKDAY_CHARS, i))
 710                         gmime_datetok_table[i] |= DATE_TOKEN_NON_WEEKDAY;
 711
 712                 if (!strchr (MONTH_CHARS, i))
 713                         gmime_datetok_table[i] |= DATE_TOKEN_NON_MONTH;
 714
 715                 if (!strchr (TIME_CHARS, i))
 716                         gmime_datetok_table[i] |= DATE_TOKEN_NON_TIME;
 717
 718                 if (!strchr (TIMEZONE_ALPHA_CHARS, i))
 719                         gmime_datetok_table[i] |= DATE_TOKEN_NON_TIMEZONE_ALPHA;
 720
 721                 if (!strchr (TIMEZONE_NUMERIC_CHARS, i))
 722                         gmime_datetok_table[i] |= DATE_TOKEN_NON_TIMEZONE_NUMERIC;
 723
 724                 if (((char) i) == ':')
 725                         gmime_datetok_table[i] |= DATE_TOKEN_HAS_COLON;
 726
 727                 if (strchr ("+-", i))
 728                         gmime_datetok_table[i] |= DATE_TOKEN_HAS_SIGN;
 729         }
 730
 731         printf ("static unsigned char gmime_datetok_table[256] = {");
 732         for (i = 0; i < 256; i++) {
 733                 if (i % 16 == 0)
 734                         printf ("\n\t");
 735                 printf ("%3d,", gmime_datetok_table[i]);
 736         }
 737         printf ("\n};\n");
 738 }
 739 #endif
 740
 741
 742 /**
 743  * g_mime_utils_header_decode_date:
 744  * @str: input date string
 745  * @tz_offset: timezone offset
 746  *
 747  * Decodes the rfc822 date string and saves the GMT offset into
 748  * @tz_offset if non-NULL.
 749  *
 750  * Returns: the time_t representation of the date string specified by
 751  * @str or (time_t) %0 on error. If @tz_offset is non-NULL, the value
 752  * of the timezone offset will be stored.
 753  **/
 754 time_t
 755 g_mime_utils_header_decode_date (const char *str, int *tz_offset)
 756 {
 757         date_token *token, *tokens;
 758         time_t date;
 759
 760         if (!(tokens = datetok (str))) {
 761                 if (tz_offset)
 762                         *tz_offset = 0;
 763
 764                 return (time_t) 0;
 765         }
 766
 767         if (!(date = parse_rfc822_date (tokens, tz_offset)))
 768                 date = parse_broken_date (tokens, tz_offset);
 769
 770         /* cleanup */
 771         while (tokens) {
 772                 token = tokens;
 773                 tokens = tokens->next;
 774                 date_token_free (token);
 775         }
 776
 777         return date;
 778 }
 779
 780
 781 /**
 782  * g_mime_utils_generate_message_id:
 783  * @fqdn: Fully qualified domain name
 784  *
 785  * Generates a unique Message-Id.
 786  *
 787  * Returns: a unique string in an addr-spec format suitable for use as
 788  * a Message-Id.
 789  **/
 790 char *
 791 g_mime_utils_generate_message_id (const char *fqdn)
 792 {
 793 #ifdef G_THREADS_ENABLED
 794         static GStaticMutex mutex = G_STATIC_MUTEX_INIT;
 795 #define MUTEX_LOCK()   g_static_mutex_lock (&mutex)
 796 #define MUTEX_UNLOCK() g_static_mutex_unlock (&mutex)
 797 #else
 798 #define MUTEX_LOCK()
 799 #define MUTEX_UNLOCK()
 800 #endif
 801         static unsigned long int count = 0;
 802         const char *hostname = NULL;
 803         char *name = NULL;
 804         char *msgid;
 805
 806         if (!fqdn) {
 807 #ifdef HAVE_UTSNAME_DOMAINNAME
 808                 struct utsname unam;
 809
 810                 uname (&unam);
 811
 812                 hostname = unam.nodename;
 813
 814                 if (unam.domainname[0])
 815                         name = g_strdup_printf ("%s.%s", hostname, unam.domainname);
 816 #else /* ! HAVE_UTSNAME_DOMAINNAME */
 817                 char host[MAXHOSTNAMELEN + 1];
 818
 819 #ifdef HAVE_GETHOSTNAME
 820                 host[MAXHOSTNAMELEN] = '\0';
 821                 if (gethostname (host, MAXHOSTNAMELEN) == 0) {
 822 #ifdef HAVE_GETDOMAINNAME
 823                         size_t domainlen = MAXHOSTNAMELEN;
 824                         char *domain;
 825                         int rv;
 826
 827                         domain = g_malloc (domainlen);
 828
 829                         while ((rv = getdomainname (domain, domainlen)) == -1 && errno == EINVAL) {
 830                                 domainlen += MAXHOSTNAMELEN;
 831                                 domain = g_realloc (domain, domainlen);
 832                         }
 833
 834                         if (rv == 0 && domain[0]) {
 835                                 if (host[0]) {
 836                                         name = g_strdup_printf ("%s.%s", host, domain);
 837                                         g_free (domain);
 838                                 } else {
 839                                         name = domain;
 840                                 }
 841                         }
 842 #endif /* HAVE_GETDOMAINNAME */
 843                 } else {
 844                         host[0] = '\0';
 845                 }
 846 #endif /* HAVE_GETHOSTNAME */
 847                 hostname = host;
 848 #endif /* HAVE_UTSNAME_DOMAINNAME */
 849
 850 #ifdef HAVE_GETADDRINFO
 851                 if (!name && hostname[0]) {
 852                         /* we weren't able to get a domain name */
 853                         struct addrinfo hints, *res;
 854
 855                         memset (&hints, 0, sizeof (hints));
 856                         hints.ai_flags = AI_CANONNAME;
 857
 858                         if (getaddrinfo (hostname, NULL, &hints, &res) == 0) {
 859                                 name = g_strdup (res->ai_canonname);
 860                                 freeaddrinfo (res);
 861                         }
 862                 }
 863 #endif /* HAVE_GETADDRINFO */
 864
 865                 fqdn = name != NULL ? name : (hostname[0] ? hostname : "localhost.localdomain");
 866         }
 867
 868         MUTEX_LOCK ();
 869         msgid = g_strdup_printf ("%lu.%lu.%lu@%s", (unsigned long int) time (NULL),
 870                                  (unsigned long int) getpid (), count++, fqdn);
 871         MUTEX_UNLOCK ();
 872
 873         g_free (name);
 874
 875         return msgid;
 876 }
 877
 878 static char *
 879 decode_addrspec (const char **in)
 880 {
 881         const char *word, *inptr;
 882         GString *addrspec;
 883         char *str;
 884
 885         decode_lwsp (in);
 886         inptr = *in;
 887
 888         if (!(word = decode_word (&inptr))) {
 889                 w(g_warning ("No local-part in addr-spec: %s", *in));
 890                 return NULL;
 891         }
 892
 893         addrspec = g_string_new ("");
 894         g_string_append_len (addrspec, word, (size_t) (inptr - word));
 895
 896         /* get the rest of the local-part */
 897         decode_lwsp (&inptr);
 898         while (*inptr == '.') {
 899                 g_string_append_c (addrspec, *inptr++);
 900                 if ((word = decode_word (&inptr))) {
 901                         g_string_append_len (addrspec, word, (size_t) (inptr - word));
 902                         decode_lwsp (&inptr);
 903                 } else {
 904                         w(g_warning ("Invalid local-part in addr-spec: %s", *in));
 905                         goto exception;
 906                 }
 907         }
 908
 909         /* we should be at the '@' now... */
 910         if (*inptr++ != '@') {
 911                 w(g_warning ("Invalid addr-spec; missing '@': %s", *in));
 912                 goto exception;
 913         }
 914
 915         g_string_append_c (addrspec, '@');
 916         if (!decode_domain (&inptr, addrspec)) {
 917                 w(g_warning ("No domain in addr-spec: %s", *in));
 918                 goto exception;
 919         }
 920
 921         str = addrspec->str;
 922         g_string_free (addrspec, FALSE);
 923
 924         *in = inptr;
 925
 926         return str;
 927
 928  exception:
 929
 930         g_string_free (addrspec, TRUE);
 931
 932         return NULL;
 933 }
 934
 935 static char *
 936 decode_msgid (const char **in)
 937 {
 938         const char *inptr = *in;
 939         char *msgid = NULL;
 940
 941         decode_lwsp (&inptr);
 942         if (*inptr != '<') {
 943                 w(g_warning ("Invalid msg-id; missing '<': %s", *in));
 944         } else {
 945                 inptr++;
 946         }
 947
 948         decode_lwsp (&inptr);
 949         if ((msgid = decode_addrspec (&inptr))) {
 950                 decode_lwsp (&inptr);
 951                 if (*inptr != '>') {
 952                         w(g_warning ("Invalid msg-id; missing '>': %s", *in));
 953                 } else {
 954                         inptr++;
 955                 }
 956
 957                 *in = inptr;
 958         } else {
 959                 w(g_warning ("Invalid msg-id; missing addr-spec: %s", *in));
 960                 *in = inptr;
 961                 while (*inptr && *inptr != '>')
 962                         inptr++;
 963
 964                 msgid = g_strndup (*in, (size_t) (inptr - *in));
 965                 *in = inptr;
 966         }
 967
 968         return msgid;
 969 }
 970
 971
 972 /**
 973  * g_mime_utils_decode_message_id:
 974  * @message_id: string containing a message-id
 975  *
 976  * Decodes a msg-id as defined by rfc822.
 977  *
 978  * Returns: the addr-spec portion of the msg-id.
 979  **/
 980 char *
 981 g_mime_utils_decode_message_id (const char *message_id)
 982 {
 983         g_return_val_if_fail (message_id != NULL, NULL);
 984
 985         return decode_msgid (&message_id);
 986 }
 987
 988
 989 /**
 990  * g_mime_references_decode:
 991  * @text: string containing a list of msg-ids
 992  *
 993  * Decodes a list of msg-ids as in the References and/or In-Reply-To
 994  * headers defined in rfc822.
 995  *
 996  * Returns: a list of referenced msg-ids.
 997  **/
 998 GMimeReferences *
 999 g_mime_references_decode (const char *text)
1000 {
1001         GMimeReferences refs, *tail, *ref;
1002         const char *word, *inptr = text;
1003         char *msgid;
1004
1005         g_return_val_if_fail (text != NULL, NULL);
1006
1007         tail = (GMimeReferences *) &refs;
1008         refs.next = NULL;
1009
1010         while (*inptr) {
1011                 decode_lwsp (&inptr);
1012                 if (*inptr == '<') {
1013                         /* looks like a msg-id */
1014                         if ((msgid = decode_msgid (&inptr))) {
1015                                 ref = g_new (GMimeReferences, 1);
1016                                 ref->next = NULL;
1017                                 ref->msgid = msgid;
1018                                 tail->next = ref;
1019                                 tail = ref;
1020                         } else {
1021                                 w(g_warning ("Invalid References header: %s", inptr));
1022                                 break;
1023                         }
1024                 } else if (*inptr) {
1025                         /* looks like part of a phrase */
1026                         if (!(word = decode_word (&inptr))) {
1027                                 w(g_warning ("Invalid References header: %s", inptr));
1028                                 break;
1029                         }
1030                 }
1031         }
1032
1033         return refs.next;
1034 }
1035
1036
1037 /**
1038  * g_mime_references_append:
1039  * @refs: the address of a #GMimeReferences list
1040  * @msgid: a message-id string
1041  *
1042  * Appends a reference to msgid to the list of references.
1043  **/
1044 void
1045 g_mime_references_append (GMimeReferences **refs, const char *msgid)
1046 {
1047         GMimeReferences *ref;
1048
1049         g_return_if_fail (refs != NULL);
1050         g_return_if_fail (msgid != NULL);
1051
1052         ref = (GMimeReferences *) refs;
1053         while (ref->next)
1054                 ref = ref->next;
1055
1056         ref->next = g_new (GMimeReferences, 1);
1057         ref->next->msgid = g_strdup (msgid);
1058         ref->next->next = NULL;
1059 }
1060
1061
1062 /**
1063  * g_mime_references_free:
1064  * @refs: a #GMimeReferences list
1065  *
1066  * Frees the #GMimeReferences list.
1067  **/
1068 void
1069 g_mime_references_free (GMimeReferences *refs)
1070 {
1071         GMimeReferences *ref, *next;
1072
1073         ref = refs;
1074         while (ref) {
1075                 next = ref->next;
1076                 g_free (ref->msgid);
1077                 g_free (ref);
1078                 ref = next;
1079         }
1080 }
1081
1082
1083 /**
1084  * g_mime_references_clear:
1085  * @refs: address of a #GMimeReferences list
1086  *
1087  * Clears the #GMimeReferences list and resets it to %NULL.
1088  **/
1089 void
1090 g_mime_references_clear (GMimeReferences **refs)
1091 {
1092         g_return_if_fail (refs != NULL);
1093
1094         g_mime_references_free (*refs);
1095         *refs = NULL;
1096 }
1097
1098
1099 /**
1100  * g_mime_references_get_next:
1101  * @ref: a #GMimeReferences list
1102  *
1103  * Advances to the next reference node in the #GMimeReferences list.
1104  *
1105  * Returns: the next reference node in the #GMimeReferences list.
1106  **/
1107 const GMimeReferences *
1108 g_mime_references_get_next (const GMimeReferences *ref)
1109 {
1110         return ref ? ref->next : NULL;
1111 }
1112
1113
1114 /**
1115  * g_mime_references_get_message_id:
1116  * @ref: a #GMimeReferences list
1117  *
1118  * Gets the Message-Id reference from the #GMimeReferences node.
1119  *
1120  * Returns: the Message-Id reference from the #GMimeReferences node.
1121  **/
1122 const char *
1123 g_mime_references_get_message_id (const GMimeReferences *ref)
1124 {
1125         return ref ? ref->msgid : NULL;
1126 }
1127
1128
1129 static gboolean
1130 is_rfc2047_token (const char *inptr, size_t len)
1131 {
1132         if (len < 8 || strncmp (inptr, "=?", 2) != 0 || strncmp (inptr + len - 2, "?=", 2) != 0)
1133                 return FALSE;
1134
1135         inptr += 2;
1136         len -= 2;
1137
1138         /* skip past the charset */
1139         while (*inptr != '?' && len > 0) {
1140                 inptr++;
1141                 len--;
1142         }
1143
1144         if (*inptr != '?' || len < 4)
1145                 return FALSE;
1146
1147         if (inptr[1] != 'q' && inptr[1] != 'Q' && inptr[1] != 'b' && inptr[1] != 'B')
1148                 return FALSE;
1149
1150         inptr += 2;
1151         len -= 2;
1152
1153         if (*inptr != '?')
1154                 return FALSE;
1155
1156         return TRUE;
1157 }
1158
1159 static char *
1160 header_fold (const char *in, gboolean structured)
1161 {
1162         gboolean last_was_lwsp = FALSE;
1163         register const char *inptr;
1164         size_t len, outlen, i;
1165         size_t fieldlen;
1166         GString *out;
1167         char *ret;
1168
1169         inptr = in;
1170         len = strlen (in);
1171         if (len <= GMIME_FOLD_LEN + 1)
1172                 return g_strdup (in);
1173
1174         out = g_string_new ("");
1175         fieldlen = strcspn (inptr, ": \t\n");
1176         g_string_append_len (out, inptr, fieldlen);
1177         outlen = fieldlen;
1178         inptr += fieldlen;
1179
1180         while (*inptr && *inptr != '\n') {
1181                 len = strcspn (inptr, " \t\n");
1182
1183                 if (len > 1 && outlen + len > GMIME_FOLD_LEN) {
1184                         if (outlen > 1 && out->len >= fieldlen + 2) {
1185                                 if (last_was_lwsp) {
1186                                         if (structured)
1187                                                 out->str[out->len - 1] = '\t';
1188
1189                                         g_string_insert_c (out, out->len - 1, '\n');
1190                                 } else
1191                                         g_string_append (out, "\n\t");
1192
1193                                 outlen = 1;
1194                         }
1195
1196                         if (!structured && !is_rfc2047_token (inptr, len)) {
1197                                 /* check for very long words, just cut them up */
1198                                 while (outlen + len > GMIME_FOLD_LEN) {
1199                                         for (i = 0; i < GMIME_FOLD_LEN - outlen; i++)
1200                                                 g_string_append_c (out, inptr[i]);
1201                                         inptr += GMIME_FOLD_LEN - outlen;
1202                                         len -= GMIME_FOLD_LEN - outlen;
1203                                         g_string_append (out, "\n\t");
1204                                         outlen = 1;
1205                                 }
1206                         } else {
1207                                 g_string_append_len (out, inptr, len);
1208                                 outlen += len;
1209                                 inptr += len;
1210                         }
1211                         last_was_lwsp = FALSE;
1212                 } else if (len > 0) {
1213                         g_string_append_len (out, inptr, len);
1214                         outlen += len;
1215                         inptr += len;
1216                         last_was_lwsp = FALSE;
1217                 } else {
1218                         last_was_lwsp = TRUE;
1219                         if (*inptr == '\t') {
1220                                 /* tabs are a good place to fold, odds
1221                                    are that this is where the previous
1222                                    mailer folded it */
1223                                 g_string_append (out, "\n\t");
1224                                 outlen = 1;
1225                                 while (is_blank (*inptr))
1226                                         inptr++;
1227                         } else {
1228                                 g_string_append_c (out, *inptr++);
1229                                 outlen++;
1230                         }
1231                 }
1232         }
1233
1234         if (*inptr == '\n' && out->str[out->len - 1] != '\n')
1235                 g_string_append_c (out, '\n');
1236
1237         ret = out->str;
1238         g_string_free (out, FALSE);
1239
1240         return ret;
1241 }
1242
1243
1244 /**
1245  * g_mime_utils_structured_header_fold:
1246  * @str: input string
1247  *
1248  * Folds a structured header according to the rules in rfc822.
1249  *
1250  * Returns: an allocated string containing the folded header.
1251  **/
1252 char *
1253 g_mime_utils_structured_header_fold (const char *str)
1254 {
1255         return header_fold (str, TRUE);
1256 }
1257
1258
1259 /**
1260  * g_mime_utils_unstructured_header_fold:
1261  * @str: input string
1262  *
1263  * Folds an unstructured header according to the rules in rfc822.
1264  *
1265  * Returns: an allocated string containing the folded header.
1266  **/
1267 char *
1268 g_mime_utils_unstructured_header_fold (const char *str)
1269 {
1270         return header_fold (str, FALSE);
1271 }
1272
1273
1274 /**
1275  * g_mime_utils_header_fold:
1276  * @str: input string
1277  *
1278  * Folds a structured header according to the rules in rfc822.
1279  *
1280  * Returns: an allocated string containing the folded header.
1281  **/
1282 char *
1283 g_mime_utils_header_fold (const char *str)
1284 {
1285         return header_fold (str, TRUE);
1286 }
1287
1288
1289 /**
1290  * g_mime_utils_header_printf:
1291  * @format: string format
1292  * @Varargs: arguments
1293  *
1294  * Allocates a buffer containing a formatted header specified by the
1295  * @Varargs.
1296  *
1297  * Returns: an allocated string containing the folded header specified
1298  * by @format and the following arguments.
1299  **/
1300 char *
1301 g_mime_utils_header_printf (const char *format, ...)
1302 {
1303         char *buf, *ret;
1304         va_list ap;
1305
1306         va_start (ap, format);
1307         buf = g_strdup_vprintf (format, ap);
1308         va_end (ap);
1309
1310         ret = header_fold (buf, TRUE);
1311         g_free (buf);
1312
1313         return ret;
1314 }
1315
1316 static gboolean
1317 need_quotes (const char *string)
1318 {
1319         gboolean quoted = FALSE;
1320         const char *inptr;
1321
1322         inptr = string;
1323
1324         while (*inptr) {
1325                 if (*inptr == '\\')
1326                         inptr++;
1327                 else if (*inptr == '"')
1328                         quoted = !quoted;
1329                 else if (!quoted && (is_tspecial (*inptr) || *inptr == '.'))
1330                         return TRUE;
1331
1332                 if (*inptr)
1333                         inptr++;
1334         }
1335
1336         return FALSE;
1337 }
1338
1339 /**
1340  * g_mime_utils_quote_string:
1341  * @str: input string
1342  *
1343  * Quotes @string as needed according to the rules in rfc2045.
1344  *
1345  * Returns: an allocated string containing the escaped and quoted (if
1346  * needed to be) input string. The decision to quote the string is
1347  * based on whether or not the input string contains any 'tspecials'
1348  * as defined by rfc2045.
1349  **/
1350 char *
1351 g_mime_utils_quote_string (const char *str)
1352 {
1353         gboolean quote;
1354         const char *c;
1355         char *qstring;
1356         GString *out;
1357
1358         out = g_string_new ("");
1359
1360         if ((quote = need_quotes (str)))
1361                 g_string_append_c (out, '"');
1362
1363         for (c = str; *c; c++) {
1364                 if ((*c == '"' && quote) || *c == '\\')
1365                         g_string_append_c (out, '\\');
1366
1367                 g_string_append_c (out, *c);
1368         }
1369
1370         if (quote)
1371                 g_string_append_c (out, '"');
1372
1373         qstring = out->str;
1374         g_string_free (out, FALSE);
1375
1376         return qstring;
1377 }
1378
1379
1380 /**
1381  * g_mime_utils_unquote_string:
1382  * @str: input string
1383  *
1384  * Unquotes and unescapes a string.
1385  **/
1386 void
1387 g_mime_utils_unquote_string (char *str)
1388 {
1389         /* if the string is quoted, unquote it */
1390         register char *inptr = str;
1391         int escaped = FALSE;
1392         int quoted = FALSE;
1393
1394         if (!str)
1395                 return;
1396
1397         while (*inptr) {
1398                 if (*inptr == '\\') {
1399                         if (escaped)
1400                                 *str++ = *inptr++;
1401                         else
1402                                 inptr++;
1403                         escaped = !escaped;
1404                 } else if (*inptr == '"') {
1405                         if (escaped) {
1406                                 *str++ = *inptr++;
1407                                 escaped = FALSE;
1408                         } else {
1409                                 quoted = !quoted;
1410                                 inptr++;
1411                         }
1412                 } else {
1413                         *str++ = *inptr++;
1414                         escaped = FALSE;
1415                 }
1416         }
1417
1418         *str = '\0';
1419 }
1420
1421
1422 /**
1423  * g_mime_utils_text_is_8bit:
1424  * @text: text to check for 8bit chars
1425  * @len: text length
1426  *
1427  * Determines if @text contains 8bit characters within the first @len
1428  * bytes.
1429  *
1430  * Returns: %TRUE if the text contains 8bit characters or %FALSE
1431  * otherwise.
1432  **/
1433 gboolean
1434 g_mime_utils_text_is_8bit (const unsigned char *text, size_t len)
1435 {
1436         register const unsigned char *inptr;
1437         const unsigned char *inend;
1438
1439         g_return_val_if_fail (text != NULL, FALSE);
1440
1441         inend = text + len;
1442         for (inptr = text; *inptr && inptr < inend; inptr++)
1443                 if (*inptr > (unsigned char) 127)
1444                         return TRUE;
1445
1446         return FALSE;
1447 }
1448
1449
1450 /**
1451  * g_mime_utils_best_encoding:
1452  * @text: text to encode
1453  * @len: text length
1454  *
1455  * Determines the best content encoding for the first @len bytes of
1456  * @text.
1457  *
1458  * Returns: a #GMimeContentEncoding that is determined to be the best
1459  * encoding type for the specified block of text. ("best" in this
1460  * particular case means smallest output size)
1461  **/
1462 GMimeContentEncoding
1463 g_mime_utils_best_encoding (const unsigned char *text, size_t len)
1464 {
1465         const unsigned char *ch, *inend;
1466         size_t count = 0;
1467
1468         inend = text + len;
1469         for (ch = text; ch < inend; ch++)
1470                 if (*ch > (unsigned char) 127)
1471                         count++;
1472
1473         if ((float) count <= len * 0.17)
1474                 return GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE;
1475         else
1476                 return GMIME_CONTENT_ENCODING_BASE64;
1477 }
1478
1479
/**
 * charset_convert:
 * @cd: iconv converter
 * @inbuf: input text buffer to convert
 * @inleft: length of the input buffer
 * @outp: pointer to output buffer
 * @outlenp: pointer to output buffer length
 * @ninval: the number of invalid bytes in @inbuf
 *
 * Converts the input buffer from one charset to another using the
 * @cd. On completion, @outp will point to the output buffer
 * containing the converted text (nul-terminated), @outlenp will be
 * the size of the @outp buffer (note: not the strlen() of @outp) and
 * @ninval will contain the number of bytes which could not be
 * converted.
 *
 * Bytes which cannot be converted from @inbuf will appear as '?'
 * characters in the output buffer.
 *
 * If *@outp is non-NULL, then it is assumed that it points to a
 * pre-allocated buffer of length *@outlenp. This is done so that the
 * same output buffer can be reused multiple times. Note that the
 * terminating nul is written after the *@outlenp usable bytes, so such
 * a buffer must have been allocated with room for *@outlenp + 1 bytes,
 * and it may be moved by g_realloc() if it needs to grow.
 *
 * Returns: the string length of the output buffer.
 **/
static size_t
charset_convert (iconv_t cd, const char *inbuf, size_t inleft, char **outp, size_t *outlenp, size_t *ninval)
{
        size_t outlen, outleft, rc, n = 0;
        char *outbuf, *out;

        if (*outp == NULL) {
                /* no buffer supplied: start with a guess, +1 for the nul */
                outleft = outlen = (inleft * 2) + 16;
                outbuf = out = g_malloc (outlen + 1);
        } else {
                /* reuse the caller's buffer from its start */
                outleft = outlen = *outlenp;
                outbuf = out = *outp;
        }

        do {
                rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft);
                if (rc == (size_t) -1) {
                        if (errno == EINVAL) {
                                /* incomplete sequence at the end of the input buffer */
                                /* the unconverted tail is counted as invalid and dropped */
                                n += inleft;
                                break;
                        }

#ifdef G_OS_WIN32
                        /* seems that GnuWin32's libiconv 1.9 does not set errno in
                         * the E2BIG case, so we have to fake it */
                        if (outleft <= inleft)
                                errno = E2BIG;
#endif

                        if (errno == E2BIG || outleft == 0) {
                                /* need to grow the output buffer */
                                outlen += (inleft * 2) + 16;
                                /* rc is reused as the write offset so that outbuf
                                 * can be re-based after the buffer moves */
                                rc = (size_t) (outbuf - out);
                                out = g_realloc (out, outlen + 1);
                                outleft = outlen - rc;
                                outbuf = out + rc;
                        }

                        /* Note: GnuWin32's libiconv 1.9 can also set errno to ERANGE
                         * which seems to mean that it encountered a character that
                         * does not fit the specified 'from' charset. We'll handle
                         * that the same way we handle EILSEQ. */
                        if (errno == EILSEQ || errno == ERANGE) {
                                /* invalid or incomplete multibyte
                                 * sequence in the input buffer */
                                /* substitute '?' for one input byte and move on; the
                                 * growth check above ensures outleft is non-zero here */
                                *outbuf++ = '?';
                                outleft--;
                                inleft--;
                                inbuf++;
                                n++;
                        }
                }
        } while (inleft > 0);

        /* call iconv with no input so it can emit any pending output
         * (e.g. a shift sequence), growing the buffer while it reports E2BIG */
        while (iconv (cd, NULL, NULL, &outbuf, &outleft) == (size_t) -1) {
                if (errno != E2BIG)
                        break;

                outlen += 16;
                rc = (size_t) (outbuf - out);
                out = g_realloc (out, outlen + 1);
                outleft = outlen - rc;
                outbuf = out + rc;
        }

        /* the extra byte allocated beyond outlen holds this terminator */
        *outbuf = '\0';

        *outlenp = outlen;
        *outp = out;
        *ninval = n;

        return (outbuf - out);
}
1579
1580
/* Bit flags used by g_mime_utils_decode_8bit() to record which of the
 * built-in fallback charsets (UTF-8, the locale charset, iso-8859-1)
 * are already covered and therefore need not be appended to the list
 * of charsets to try. */
#define USER_CHARSETS_INCLUDE_UTF8    (1 << 0)
#define USER_CHARSETS_INCLUDE_LOCALE  (1 << 1)
#define USER_CHARSETS_INCLUDE_LATIN1  (1 << 2)
1584
1585
1586 /**
1587  * g_mime_utils_decode_8bit:
1588  * @text: input text in unknown 8bit/multibyte character set
1589  * @len: input text length
1590  *
1591  * Attempts to convert text in an unknown 8bit/multibyte charset into
1592  * UTF-8 by finding the charset which will convert the most bytes into
1593  * valid UTF-8 characters as possible. If no exact match can be found,
1594  * it will choose the best match and convert invalid byte sequences
1595  * into question-marks (?) in the returned string buffer.
1596  *
1597  * Returns: a UTF-8 string representation of @text.
1598  **/
1599 char *
1600 g_mime_utils_decode_8bit (const char *text, size_t len)
1601 {
1602         const char **charsets, **user_charsets, *locale, *best;
1603         size_t outleft, outlen, min, ninval;
1604         unsigned int included = 0;
1605         iconv_t cd;
1606         char *out;
1607         int i = 0;
1608
1609         g_return_val_if_fail (text != NULL, NULL);
1610
1611         locale = g_mime_locale_charset ();
1612         if (!g_ascii_strcasecmp (locale, "iso-8859-1") ||
1613             !g_ascii_strcasecmp (locale, "UTF-8")) {
1614                 /* If the user's locale charset is either of these, we
1615                  * don't need to include the locale charset in our list
1616                  * of fallback charsets. */
1617                 included |= USER_CHARSETS_INCLUDE_LOCALE;
1618         }
1619
1620         if ((user_charsets = g_mime_user_charsets ())) {
1621                 while (user_charsets[i])
1622                         i++;
1623         }
1624
1625         charsets = g_alloca (sizeof (char *) * (i + 4));
1626         i = 0;
1627
1628         if (user_charsets) {
1629                 while (user_charsets[i]) {
1630                         /* keep a record of whether or not the user-supplied
1631                          * charsets include UTF-8, Latin1, or the user's locale
1632                          * charset so that we avoid doubling our efforts for
1633                          * these 3 charsets. We could have used a hash table
1634                          * to keep track of unique charsets, but we can
1635                          * (hopefully) assume that user_charsets is a unique
1636                          * list of charsets with no duplicates. */
1637                         if (!g_ascii_strcasecmp (user_charsets[i], "iso-8859-1"))
1638                                 included |= USER_CHARSETS_INCLUDE_LATIN1;
1639
1640                         if (!g_ascii_strcasecmp (user_charsets[i], "UTF-8"))
1641                                 included |= USER_CHARSETS_INCLUDE_UTF8;
1642
1643                         if (!g_ascii_strcasecmp (user_charsets[i], locale))
1644                                 included |= USER_CHARSETS_INCLUDE_LOCALE;
1645
1646                         charsets[i] = user_charsets[i];
1647                         i++;
1648                 }
1649         }
1650
1651         if (!(included & USER_CHARSETS_INCLUDE_UTF8))
1652                 charsets[i++] = "UTF-8";
1653
1654         if (!(included & USER_CHARSETS_INCLUDE_LOCALE))
1655                 charsets[i++] = locale;
1656
1657         if (!(included & USER_CHARSETS_INCLUDE_LATIN1))
1658                 charsets[i++] = "iso-8859-1";
1659
1660         charsets[i] = NULL;
1661
1662         min = len;
1663         best = charsets[0];
1664
1665         outleft = (len * 2) + 16;
1666         out = g_malloc (outleft + 1);
1667
1668         for (i = 0; charsets[i]; i++) {
1669                 if ((cd = g_mime_iconv_open ("UTF-8", charsets[i])) == (iconv_t) -1)
1670                         continue;
1671
1672                 outlen = charset_convert (cd, text, len, &out, &outleft, &ninval);
1673
1674                 g_mime_iconv_close (cd);
1675
1676                 if (ninval == 0)
1677                         return g_realloc (out, outlen + 1);
1678
1679                 if (ninval < min) {
1680                         best = charsets[i];
1681                         min = ninval;
1682                 }
1683         }
1684
1685         /* if we get here, then none of the charsets fit the 8bit text flawlessly...
1686          * try to find the one that fit the best and use that to convert what we can,
1687          * replacing any byte we can't convert with a '?' */
1688
1689         if ((cd = g_mime_iconv_open ("UTF-8", best)) == (iconv_t) -1) {
1690                 /* this shouldn't happen... but if we are here, then
1691                  * it did...  the only thing we can do at this point
1692                  * is replace the 8bit garbage and pray */
1693                 register const char *inptr = text;
1694                 const char *inend = inptr + len;
1695                 char *outbuf = out;
1696
1697                 while (inptr < inend) {
1698                         if (is_ascii (*inptr))
1699                                 *outbuf++ = *inptr;
1700                         else
1701                                 *outbuf++ = '?';
1702
1703                         inptr++;
1704                 }
1705
1706                 *outbuf++ = '\0';
1707
1708                 return g_realloc (out, (size_t) (outbuf - out));
1709         }
1710
1711         outlen = charset_convert (cd, text, len, &out, &outleft, &ninval);
1712
1713         g_mime_iconv_close (cd);
1714
1715         return g_realloc (out, outlen + 1);
1716 }
1717
1718
/*
 * quoted_decode:
 * @in: encoded input bytes
 * @len: number of bytes in @in
 * @out: output buffer (no size is passed in and no bounds checking is
 *       done here; the caller must supply a large enough buffer)
 * @state: in/out: number of hex digits still needed to complete an
 *         "=XX" sequence that was cut off at the end of the previous
 *         call (0, 1 or 2)
 * @save: in/out: when *@state is 1, the first hex digit that was
 *        already seen
 *
 * this decodes rfc2047's version of quoted-printable: "=XX" becomes
 * the byte with hex value XX, '_' becomes a space and everything else
 * is copied through. A '=' that is not followed by two hex digits is
 * emitted literally.
 *
 * Returns: the number of bytes written to @out.
 */
static size_t
quoted_decode (const unsigned char *in, size_t len, unsigned char *out, int *state, guint32 *save)
{
        register const unsigned char *inptr;
        register unsigned char *outptr;
        const unsigned char *inend;
        unsigned char c, c1;
        guint32 saved;
        int need;

        /* nothing to consume: leave *state and *save untouched */
        if (len == 0)
                return 0;

        inend = in + len;
        outptr = out;
        inptr = in;

        need = *state;
        saved = *save;

        /* resume a sequence that was split across two calls */
        if (need > 0) {
                if (isxdigit ((int) *inptr)) {
                        if (need == 1) {
                                /* first digit came from the previous call,
                                 * second digit is the first byte of this one */
                                c = g_ascii_toupper ((int) (saved & 0xff));
                                c1 = g_ascii_toupper ((int) *inptr++);
                                saved = 0;
                                need = 0;

                                /* jumps into the main loop body below */
                                goto decode;
                        }

                        saved = 0;
                        need = 0;

                        /* only the '=' was seen so far: handle the input as if
                         * the '=' had just been read (jumps into the loop) */
                        goto equals;
                }

                /* last encoded-word ended in a malformed quoted-printable sequence */
                *outptr++ = '=';

                if (need == 1)
                        *outptr++ = (char) (saved & 0xff);

                saved = 0;
                need = 0;
        }

        while (inptr < inend) {
                c = *inptr++;
                if (c == '=') {
                equals:
                        if (inend - inptr >= 2) {
                                if (isxdigit ((int) inptr[0]) && isxdigit ((int) inptr[1])) {
                                        c = g_ascii_toupper (*inptr++);
                                        c1 = g_ascii_toupper (*inptr++);
                                decode:
                                        /* combine the two (upper-cased) hex digits into a byte */
                                        *outptr++ = (((c >= 'A' ? c - 'A' + 10 : c - '0') & 0x0f) << 4)
                                                | ((c1 >= 'A' ? c1 - 'A' + 10 : c1 - '0') & 0x0f);
                                } else {
                                        /* malformed quoted-printable sequence? */
                                        *outptr++ = '=';
                                }
                        } else {
                                /* truncated payload, maybe it was split across encoded-words? */
                                if (inptr < inend) {
                                        if (isxdigit ((int) *inptr)) {
                                                /* remember the one digit we have and
                                                 * ask for one more on the next call */
                                                saved = *inptr;
                                                need = 1;
                                                break;
                                        } else {
                                                /* malformed quoted-printable sequence? */
                                                *outptr++ = '=';
                                        }
                                } else {
                                        /* '=' was the last byte: two digits are still needed */
                                        saved = 0;
                                        need = 2;
                                        break;
                                }
                        }
                } else if (c == '_') {
                        /* _'s are an rfc2047 shortcut for encoding spaces */
                        *outptr++ = ' ';
                } else {
                        *outptr++ = c;
                }
        }

        /* hand any pending partial sequence back to the caller */
        *state = need;
        *save = saved;

        return (size_t) (outptr - out);
}
1812
/* Quick syntactic test: TRUE when the @len bytes at @atom are at least
 * 7 bytes long, start with "=?" and end with "?=". Note that both
 * macro arguments are expanded more than once and are not
 * parenthesized, so pass simple expressions without side effects. */
#define is_rfc2047_encoded_word(atom, len) (len >= 7 && !strncmp (atom, "=?", 2) && !strncmp (atom + len - 2, "?=", 2))

/* A node in a singly-linked list of tokens. */
typedef struct _rfc2047_token {
        struct _rfc2047_token *next;   /* next token in the list */
        const char *charset;           /* presumably the charset of an encoded-word token -- TODO confirm against users */
        const char *text;              /* start of the token's text */
        size_t length;                 /* length of @text in bytes */
        char encoding;                 /* presumably the encoded-word encoding character -- TODO confirm */
        char is_8bit;                  /* presumably a flag for text containing 8bit bytes -- TODO confirm */
} rfc2047_token;

/* free a whole chain of slice-allocated tokens linked through 'next' */
#define rfc2047_token_list_free(tokens) g_slice_free_chain (rfc2047_token, tokens, next)
/* free a single slice-allocated token */
#define rfc2047_token_free(token) g_slice_free (rfc2047_token, token)
1826
1827 static rfc2047_token *
1828 rfc2047_token_new (const char *text, size_t len)
1829 {
1830         rfc2047_token *token;
1831
1832         token = g_slice_new0 (rfc2047_token);
1833         token->length = len;
1834         token->text = text;
1835
1836         return token;
1837 }
1838
1839 static rfc2047_token *
1840 rfc2047_token_new_encoded_word (const char *word, size_t len)
1841 {
1842         rfc2047_token *token;
1843         const char *payload;
1844         const char *charset;
1845         const char *inptr;
1846         char *buf, *lang;
1847         char encoding;
1848         size_t n;
1849
1850         /* check that this could even be an encoded-word token */
1851         if (len < 7 || strncmp (word, "=?", 2) != 0 || strncmp (word + len - 2, "?=", 2) != 0)
1852                 return NULL;
1853
1854         /* skip over '=?' */
1855         inptr = word + 2;
1856         charset = inptr;
1857
1858         if (*charset == '?' || *charset == '*') {
1859                 /* this would result in an empty charset */
1860                 return NULL;
1861         }
1862
1863         /* skip to the end of the charset */
1864         if (!(inptr = memchr (inptr, '?', len - 2)) || inptr[2] != '?')
1865                 return NULL;
1866
1867         /* copy the charset into a buffer */
1868         n = (size_t) (inptr - charset);
1869         buf = g_alloca (n + 1);
1870         memcpy (buf, charset, n);
1871         buf[n] = '\0';
1872         charset = buf;
1873
1874         /* rfc2231 updates rfc2047 encoded words...
1875          * The ABNF given in RFC 2047 for encoded-words is:
1876          *   encoded-word := "=?" charset "?" encoding "?" encoded-text "?="
1877          * This specification changes this ABNF to:
1878          *   encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
1879          */
1880
1881         /* trim off the 'language' part if it's there... */
1882         if ((lang = strchr (charset, '*')))
1883                 *lang = '\0';
1884
1885         /* skip over the '?' */
1886         inptr++;
1887
1888         /* make sure the first char after the encoding is another '?' */
1889         if (inptr[1] != '?')
1890                 return NULL;
1891
1892         switch (*inptr++) {
1893         case 'B': case 'b':
1894                 encoding = 'B';
1895                 break;
1896         case 'Q': case 'q':
1897                 encoding = 'Q';
1898                 break;
1899         default:
1900                 return NULL;
1901         }
1902
1903         /* the payload begins right after the '?' */
1904         payload = inptr + 1;
1905
1906         /* find the end of the payload */
1907         inptr = word + len - 2;
1908
1909         /* make sure that we don't have something like: =?iso-8859-1?Q?= */
1910         if (payload > inptr)
1911                 return NULL;
1912
1913         token = rfc2047_token_new (payload, inptr - payload);
1914         token->charset = g_mime_charset_iconv_name (charset);
1915         token->encoding = encoding;
1916
1917         return token;
1918 }
1919
1920 static rfc2047_token *
1921 tokenize_rfc2047_phrase (const char *in, size_t *len)
1922 {
1923         gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds ();
1924         rfc2047_token list, *lwsp, *token, *tail;
1925         register const char *inptr = in;
1926         gboolean encoded = FALSE;
1927         const char *text, *word;
1928         gboolean ascii;
1929         size_t n;
1930
1931         tail = (rfc2047_token *) &list;
1932         list.next = NULL;
1933         lwsp = NULL;
1934
1935         while (*inptr != '\0') {
1936                 text = inptr;
1937                 while (is_lwsp (*inptr))
1938                         inptr++;
1939
1940                 if (inptr > text)
1941                         lwsp = rfc2047_token_new (text, inptr - text);
1942                 else
1943                         lwsp = NULL;
1944
1945                 word = inptr;
1946                 ascii = TRUE;
1947                 if (is_atom (*inptr)) {
1948                         if (G_UNLIKELY (enable_rfc2047_workarounds)) {
1949                                 /* Make an extra effort to detect and
1950                                  * separate encoded-word tokens that
1951                                  * have been merged with other
1952                                  * words. */
1953
1954                                 if (!strncmp (inptr, "=?", 2)) {
1955                                         inptr += 2;
1956
1957                                         /* skip past the charset (if one is even declared, sigh) */
1958                                         while (*inptr && *inptr != '?') {
1959                                                 ascii = ascii && is_ascii (*inptr);
1960                                                 inptr++;
1961                                         }
1962
1963                                         /* sanity check encoding type */
1964                                         if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
1965                                                 goto non_rfc2047;
1966
1967                                         inptr += 3;
1968
1969                                         /* find the end of the rfc2047 encoded word token */
1970                                         while (*inptr && strncmp (inptr, "?=", 2) != 0) {
1971                                                 ascii = ascii && is_ascii (*inptr);
1972                                                 inptr++;
1973                                         }
1974
1975                                         if (*inptr == '\0') {
1976                                                 /* didn't find an end marker... */
1977                                                 inptr = word + 2;
1978                                                 ascii = TRUE;
1979
1980                                                 goto non_rfc2047;
1981                                         }
1982
1983                                         inptr += 2;
1984                                 } else {
1985                                 non_rfc2047:
1986                                         /* stop if we encounter a possible rfc2047 encoded
1987                                          * token even if it's inside another word, sigh. */
1988                                         while (is_atom (*inptr) && strncmp (inptr, "=?", 2) != 0)
1989                                                 inptr++;
1990                                 }
1991                         } else {
1992                                 while (is_atom (*inptr))
1993                                         inptr++;
1994                         }
1995
1996                         n = (size_t) (inptr - word);
1997                         if ((token = rfc2047_token_new_encoded_word (word, n))) {
1998                                 /* rfc2047 states that you must ignore all
1999                                  * whitespace between encoded words */
2000                                 if (!encoded && lwsp != NULL) {
2001                                         tail->next = lwsp;
2002                                         tail = lwsp;
2003                                 } else if (lwsp != NULL) {
2004                                         rfc2047_token_free (lwsp);
2005                                 }
2006
2007                                 tail->next = token;
2008                                 tail = token;
2009
2010                                 encoded = TRUE;
2011                         } else {
2012                                 /* append the lwsp and atom tokens */
2013                                 if (lwsp != NULL) {
2014                                         tail->next = lwsp;
2015                                         tail = lwsp;
2016                                 }
2017
2018                                 token = rfc2047_token_new (word, n);
2019                                 token->is_8bit = ascii ? 0 : 1;
2020                                 tail->next = token;
2021                                 tail = token;
2022
2023                                 encoded = FALSE;
2024                         }
2025                 } else {
2026                         /* append the lwsp token */
2027                         if (lwsp != NULL) {
2028                                 tail->next = lwsp;
2029                                 tail = lwsp;
2030                         }
2031
2032                         ascii = TRUE;
2033                         while (*inptr && !is_lwsp (*inptr) && !is_atom (*inptr)) {
2034                                 ascii = ascii && is_ascii (*inptr);
2035                                 inptr++;
2036                         }
2037
2038                         n = (size_t) (inptr - word);
2039                         token = rfc2047_token_new (word, n);
2040                         token->is_8bit = ascii ? 0 : 1;
2041
2042                         tail->next = token;
2043                         tail = token;
2044
2045                         encoded = FALSE;
2046                 }
2047         }
2048
2049         *len = (size_t) (inptr - in);
2050
2051         return list.next;
2052 }
2053
2054 static rfc2047_token *
2055 tokenize_rfc2047_text (const char *in, size_t *len)
2056 {
2057         gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds ();
2058         rfc2047_token list, *lwsp, *token, *tail;
2059         register const char *inptr = in;
2060         gboolean encoded = FALSE;
2061         const char *text, *word;
2062         gboolean ascii;
2063         size_t n;
2064
2065         tail = (rfc2047_token *) &list;
2066         list.next = NULL;
2067         lwsp = NULL;
2068
2069         while (*inptr != '\0') {
2070                 text = inptr;
2071                 while (is_lwsp (*inptr))
2072                         inptr++;
2073
2074                 if (inptr > text)
2075                         lwsp = rfc2047_token_new (text, inptr - text);
2076                 else
2077                         lwsp = NULL;
2078
2079                 if (*inptr != '\0') {
2080                         word = inptr;
2081                         ascii = TRUE;
2082
2083                         if (G_UNLIKELY (enable_rfc2047_workarounds)) {
2084                                 if (!strncmp (inptr, "=?", 2)) {
2085                                         inptr += 2;
2086
2087                                         /* skip past the charset (if one is even declared, sigh) */
2088                                         while (*inptr && *inptr != '?') {
2089                                                 ascii = ascii && is_ascii (*inptr);
2090                                                 inptr++;
2091                                         }
2092
2093                                         /* sanity check encoding type */
2094                                         if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
2095                                                 goto non_rfc2047;
2096
2097                                         inptr += 3;
2098
2099                                         /* find the end of the rfc2047 encoded word token */
2100                                         while (*inptr && strncmp (inptr, "?=", 2) != 0) {
2101                                                 ascii = ascii && is_ascii (*inptr);
2102                                                 inptr++;
2103                                         }
2104
2105                                         if (*inptr == '\0') {
2106                                                 /* didn't find an end marker... */
2107                                                 inptr = word + 2;
2108                                                 ascii = TRUE;
2109
2110                                                 goto non_rfc2047;
2111                                         }
2112
2113                                         inptr += 2;
2114                                 } else {
2115                                 non_rfc2047:
2116                                         /* stop if we encounter a possible rfc2047 encoded
2117                                          * token even if it's inside another word, sigh. */
2118                                         while (*inptr && !is_lwsp (*inptr) &&
2119                                                strncmp (inptr, "=?", 2) != 0) {
2120                                                 ascii = ascii && is_ascii (*inptr);
2121                                                 inptr++;
2122                                         }
2123                                 }
2124                         } else {
2125                                 while (*inptr && !is_lwsp (*inptr)) {
2126                                         ascii = ascii && is_ascii (*inptr);
2127                                         inptr++;
2128                                 }
2129                         }
2130
2131                         n = (size_t) (inptr - word);
2132                         if ((token = rfc2047_token_new_encoded_word (word, n))) {
2133                                 /* rfc2047 states that you must ignore all
2134                                  * whitespace between encoded words */
2135                                 if (!encoded && lwsp != NULL) {
2136                                         tail->next = lwsp;
2137                                         tail = lwsp;
2138                                 } else if (lwsp != NULL) {
2139                                         rfc2047_token_free (lwsp);
2140                                 }
2141
2142                                 tail->next = token;
2143                                 tail = token;
2144
2145                                 encoded = TRUE;
2146                         } else {
2147                                 /* append the lwsp and atom tokens */
2148                                 if (lwsp != NULL) {
2149                                         tail->next = lwsp;
2150                                         tail = lwsp;
2151                                 }
2152
2153                                 token = rfc2047_token_new (word, n);
2154                                 token->is_8bit = ascii ? 0 : 1;
2155                                 tail->next = token;
2156                                 tail = token;
2157
2158                                 encoded = FALSE;
2159                         }
2160                 } else {
2161                         if (lwsp != NULL) {
2162                                 /* appending trailing lwsp */
2163                                 tail->next = lwsp;
2164                                 tail = lwsp;
2165                         }
2166
2167                         break;
2168                 }
2169         }
2170
2171         *len = (size_t) (inptr - in);
2172
2173         return list.next;
2174 }
2175
2176 static size_t
2177 rfc2047_token_decode (rfc2047_token *token, unsigned char *outbuf, int *state, guint32 *save)
2178 {
2179         const unsigned char *inbuf = (const unsigned char *) token->text;
2180         size_t len = token->length;
2181
2182         if (token->encoding == 'B')
2183                 return g_mime_encoding_base64_decode_step (inbuf, len, outbuf, state, save);
2184         else
2185                 return quoted_decode (inbuf, len, outbuf, state, save);
2186 }
2187
2188 static char *
2189 rfc2047_decode_tokens (rfc2047_token *tokens, size_t buflen)
2190 {
2191         rfc2047_token *token, *next;
2192         size_t outlen, ninval, len;
2193         unsigned char *outptr;
2194         const char *charset;
2195         GByteArray *outbuf;
2196         GString *decoded;
2197         char encoding;
2198         guint32 save;
2199         iconv_t cd;
2200         int state;
2201         char *str;
2202
2203         decoded = g_string_sized_new (buflen + 1);
2204         outbuf = g_byte_array_sized_new (76);
2205
2206         token = tokens;
2207         while (token != NULL) {
2208                 next = token->next;
2209
2210                 if (token->encoding) {
2211                         /* In order to work around broken mailers, we need to combine
2212                          * the raw decoded content of runs of identically encoded word
2213                          * tokens before converting into UTF-8. */
2214                         encoding = token->encoding;
2215                         charset = token->charset;
2216                         len = token->length;
2217                         state = 0;
2218                         save = 0;
2219
2220                         /* find the end of the run (and measure the buffer length we'll need) */
2221                         while (next && next->encoding == encoding && !strcmp (next->charset, charset)) {
2222                                 len += next->length;
2223                                 next = next->next;
2224                         }
2225
2226                         /* make sure our temporary output buffer is large enough... */
2227                         if (len > outbuf->len)
2228                                 g_byte_array_set_size (outbuf, len);
2229
2230                         /* base64 / quoted-printable decode each of the tokens... */
2231                         outptr = outbuf->data;
2232                         outlen = 0;
2233                         do {
2234                                 /* Note: by not resetting state/save each loop, we effectively
2235                                  * treat the payloads as one continuous block, thus allowing
2236                                  * us to handle cases where a hex-encoded triplet of a
2237                                  * quoted-printable encoded payload is split between 2 or more
2238                                  * encoded-word tokens. */
2239                                 len = rfc2047_token_decode (token, outptr, &state, &save);
2240                                 token = token->next;
2241                                 outptr += len;
2242                                 outlen += len;
2243                         } while (token != next);
2244                         outptr = outbuf->data;
2245
2246                         /* convert the raw decoded text into UTF-8 */
2247                         if (!g_ascii_strcasecmp (charset, "UTF-8")) {
2248                                 /* slight optimization over going thru iconv */
2249                                 str = (char *) outptr;
2250                                 len = outlen;
2251
2252                                 while (!g_utf8_validate (str, len, (const char **) &str)) {
2253                                         len = outlen - (str - (char *) outptr);
2254                                         *str = '?';
2255                                 }
2256
2257                                 g_string_append_len (decoded, (char *) outptr, outlen);
2258                         } else if ((cd = g_mime_iconv_open ("UTF-8", charset)) == (iconv_t) -1) {
2259                                 w(g_warning ("Cannot convert from %s to UTF-8, header display may "
2260                                              "be corrupt: %s", charset[0] ? charset : "unspecified charset",
2261                                              g_strerror (errno)));
2262
2263                                 str = g_mime_utils_decode_8bit ((char *) outptr, outlen);
2264                                 g_string_append (decoded, str);
2265                                 g_free (str);
2266                         } else {
2267                                 str = g_malloc (outlen + 1);
2268                                 len = outlen;
2269
2270                                 len = charset_convert (cd, (char *) outptr, outlen, &str, &len, &ninval);
2271                                 g_mime_iconv_close (cd);
2272
2273                                 g_string_append_len (decoded, str, len);
2274                                 g_free (str);
2275
2276 #if w(!)0
2277                                 if (ninval > 0) {
2278                                         g_warning ("Failed to completely convert \"%.*s\" to UTF-8, display may be "
2279                                                    "corrupt: %s", outlen, (char *) outptr, g_strerror (errno));
2280                                 }
2281 #endif
2282                         }
2283                 } else if (token->is_8bit) {
2284                         /* *sigh* I hate broken mailers... */
2285                         str = g_mime_utils_decode_8bit (token->text, token->length);
2286                         g_string_append (decoded, str);
2287                         g_free (str);
2288                 } else {
2289                         g_string_append_len (decoded, token->text, token->length);
2290                 }
2291
2292                 token = next;
2293         }
2294
2295         g_byte_array_free (outbuf, TRUE);
2296
2297         return g_string_free (decoded, FALSE);
2298 }
2299
2300
2301 /**
2302  * g_mime_utils_header_decode_text:
2303  * @text: header text to decode
2304  *
2305  * Decodes an rfc2047 encoded 'text' header.
2306  *
2307  * Note: See g_mime_set_user_charsets() for details on how charset
2308  * conversion is handled for unencoded 8bit text and/or wrongly
2309  * specified rfc2047 encoded-word tokens.
2310  *
2311  * Returns: a newly allocated UTF-8 string representing the the decoded
2312  * header.
2313  **/
2314 char *
2315 g_mime_utils_header_decode_text (const char *text)
2316 {
2317         rfc2047_token *tokens;
2318         char *decoded;
2319         size_t len;
2320
2321         if (text == NULL)
2322                 return g_strdup ("");
2323
2324         tokens = tokenize_rfc2047_text (text, &len);
2325         decoded = rfc2047_decode_tokens (tokens, len);
2326         rfc2047_token_list_free (tokens);
2327
2328         return decoded;
2329 }
2330
2331
2332 /**
2333  * g_mime_utils_header_decode_phrase:
2334  * @phrase: header to decode
2335  *
2336  * Decodes an rfc2047 encoded 'phrase' header.
2337  *
2338  * Note: See g_mime_set_user_charsets() for details on how charset
2339  * conversion is handled for unencoded 8bit text and/or wrongly
2340  * specified rfc2047 encoded-word tokens.
2341  *
2342  * Returns: a newly allocated UTF-8 string representing the the decoded
2343  * header.
2344  **/
2345 char *
2346 g_mime_utils_header_decode_phrase (const char *phrase)
2347 {
2348         rfc2047_token *tokens;
2349         char *decoded;
2350         size_t len;
2351
2352         if (phrase == NULL)
2353                 return g_strdup ("");
2354
2355         tokens = tokenize_rfc2047_phrase (phrase, &len);
2356         decoded = rfc2047_decode_tokens (tokens, len);
2357         rfc2047_token_list_free (tokens);
2358
2359         return decoded;
2360 }
2361
2362
2363 /* rfc2047 version of quoted-printable */
2364 static size_t
2365 quoted_encode (const char *in, size_t len, unsigned char *out, gushort safemask)
2366 {
2367         register const unsigned char *inptr = (const unsigned char *) in;
2368         const unsigned char *inend = inptr + len;
2369         register unsigned char *outptr = out;
2370         unsigned char c;
2371
2372         while (inptr < inend) {
2373                 c = *inptr++;
2374                 if (c == ' ') {
2375                         *outptr++ = '_';
2376                 } else if (c != '_' && gmime_special_table[c] & safemask) {
2377                         *outptr++ = c;
2378                 } else {
2379                         *outptr++ = '=';
2380                         *outptr++ = tohex[(c >> 4) & 0xf];
2381                         *outptr++ = tohex[c & 0xf];
2382                 }
2383         }
2384
2385         return (outptr - out);
2386 }
2387
2388 static void
2389 rfc2047_encode_word (GString *string, const char *word, size_t len,
2390                      const char *charset, gushort safemask)
2391 {
2392         register char *inptr, *outptr;
2393         iconv_t cd = (iconv_t) -1;
2394         unsigned char *encoded;
2395         size_t enclen, pos;
2396         char *uword = NULL;
2397         guint32 save = 0;
2398         int state = 0;
2399         char encoding;
2400
2401         if (g_ascii_strcasecmp (charset, "UTF-8") != 0)
2402                 cd = g_mime_iconv_open (charset, "UTF-8");
2403
2404         if (cd != (iconv_t) -1) {
2405                 uword = g_mime_iconv_strndup (cd, (char *) word, len);
2406                 g_mime_iconv_close (cd);
2407         }
2408
2409         if (uword) {
2410                 len = strlen (uword);
2411                 word = uword;
2412         } else {
2413                 charset = "UTF-8";
2414         }
2415
2416         switch (g_mime_utils_best_encoding ((const unsigned char *) word, len)) {
2417         case GMIME_CONTENT_ENCODING_BASE64:
2418                 enclen = GMIME_BASE64_ENCODE_LEN (len);
2419                 encoded = g_alloca (enclen + 1);
2420
2421                 encoding = 'b';
2422
2423                 pos = g_mime_encoding_base64_encode_close ((const unsigned char *) word, len, encoded, &state, &save);
2424                 encoded[pos] = '\0';
2425
2426                 /* remove \n chars as headers need to be wrapped differently */
2427                 if (G_UNLIKELY ((inptr = strchr ((char *) encoded, '\n')))) {
2428                         outptr = inptr++;
2429                         while (G_LIKELY (*inptr)) {
2430                                 if (G_LIKELY (*inptr != '\n'))
2431                                         *outptr++ = *inptr;
2432
2433                                 inptr++;
2434                         }
2435
2436                         *outptr = '\0';
2437                 }
2438
2439                 break;
2440         case GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE:
2441                 enclen = GMIME_QP_ENCODE_LEN (len);
2442                 encoded = g_alloca (enclen + 1);
2443
2444                 encoding = 'q';
2445
2446                 pos = quoted_encode (word, len, encoded, safemask);
2447                 encoded[pos] = '\0';
2448
2449                 break;
2450         default:
2451                 encoded = NULL;
2452                 encoding = '\0';
2453                 g_assert_not_reached ();
2454         }
2455
2456         g_free (uword);
2457
2458         g_string_append_printf (string, "=?%s?%c?%s?=", charset, encoding, encoded);
2459 }
2460
2461
/* How a word has to be written out when a header is encoded.  The
 * enumerators are ordered on purpose: the scanner uses MAX() to move a
 * word to the more demanding class. */
typedef enum {
	WORD_ATOM,	/* nothing seen so far that needs quoting or encoding */
	WORD_QSTRING,	/* phrase word holding a non-atom ASCII character */
	WORD_2047	/* holds control or non-ASCII characters, or is over-long */
} rfc822_word_t;

/* One word found by the scanner; start/end point into the scanned string. */
typedef struct _rfc822_word {
	struct _rfc822_word *next;	/* following word, NULL at the end of the list */
	const char *start;		/* first byte of the word */
	const char *end;		/* one past the last byte of the word */
	rfc822_word_t type;		/* output class, see rfc822_word_t */
	int encoding;			/* 0, 1 or 2: highest character class seen in the word */
} rfc822_word;

/* Release a single word. */
#define rfc822_word_free(word) g_slice_free (rfc822_word, (word))

/* Allocate a word; the memory is NOT zeroed, so set every field. */
#define rfc822_word_new() g_slice_new (rfc822_word)
2477
2478 /* okay, so 'unstructured text' fields don't actually contain 'word'
2479  * tokens, but we can group stuff similarly... */
2480 static rfc822_word *
2481 rfc2047_encode_get_rfc822_words (const char *in, gboolean phrase)
2482 {
2483         rfc822_word words, *tail, *word;
2484         rfc822_word_t type = WORD_ATOM;
2485         const char *inptr, *start, *last;
2486         int count = 0, encoding = 0;
2487
2488         tail = (rfc822_word *) &words;
2489         words.next = NULL;
2490
2491         last = start = inptr = in;
2492         while (inptr && *inptr) {
2493                 const char *newinptr;
2494                 gunichar c;
2495
2496                 newinptr = g_utf8_next_char (inptr);
2497                 c = g_utf8_get_char (inptr);
2498                 if (newinptr == NULL || !g_unichar_validate (c)) {
2499                         w(g_warning ("Invalid UTF-8 sequence encountered"));
2500                         inptr++;
2501                         continue;
2502                 }
2503
2504                 inptr = newinptr;
2505
2506                 if (c < 256 && is_blank (c)) {
2507                         if (count > 0) {
2508                                 word = rfc822_word_new ();
2509                                 word->next = NULL;
2510                                 word->start = start;
2511                                 word->end = last;
2512                                 word->type = type;
2513                                 word->encoding = encoding;
2514
2515                                 tail->next = word;
2516                                 tail = word;
2517                                 count = 0;
2518                         }
2519
2520                         start = inptr;
2521                         type = WORD_ATOM;
2522                         encoding = 0;
2523                 } else {
2524                         count++;
2525                         if (c < 128) {
2526                                 if (is_ctrl (c)) {
2527                                         type = WORD_2047;
2528                                         encoding = MAX (encoding, 1);
2529                                 } else if (phrase && !is_atom (c)) {
2530                                         /* phrases can have qstring words */
2531                                         type = MAX (type, WORD_QSTRING);
2532                                 }
2533                         } else if (c < 256) {
2534                                 type = WORD_2047;
2535                                 encoding = MAX (encoding, 1);
2536                         } else {
2537                                 type = WORD_2047;
2538                                 encoding = 2;
2539                         }
2540
2541                         if (count >= GMIME_FOLD_PREENCODED) {
2542                                 if (type == WORD_ATOM)
2543                                         type = WORD_2047;
2544
2545                                 word = rfc822_word_new ();
2546                                 word->next = NULL;
2547                                 word->start = start;
2548                                 word->end = inptr;
2549                                 word->type = type;
2550                                 word->encoding = encoding;
2551
2552                                 tail->next = word;
2553                                 tail = word;
2554                                 count = 0;
2555
2556                                 /* Note: don't reset 'type' as it
2557                                  * needs to be preserved when breaking
2558                                  * long words */
2559                                 start = inptr;
2560                                 encoding = 0;
2561                         }
2562                 }
2563
2564                 last = inptr;
2565         }
2566
2567         if (count > 0) {
2568                 word = rfc822_word_new ();
2569                 word->next = NULL;
2570                 word->start = start;
2571                 word->end = last;
2572                 word->type = type;
2573                 word->encoding = encoding;
2574
2575                 tail->next = word;
2576                 tail = word;
2577         }
2578
2579 #if d(!)0
2580         printf ("rfc822 word tokens:\n");
2581         word = words.next;
2582         while (word) {
2583                 printf ("\t'%.*s'; type=%d, encoding=%d\n",
2584                         word->end - word->start, word->start,
2585                         word->type, word->encoding);
2586
2587                 word = word->next;
2588         }
2589 #endif
2590
2591         return words.next;
2592 }
2593
2594 #define MERGED_WORD_LT_FOLDLEN(wlen, type) ((type) == WORD_2047 ? (wlen) < GMIME_FOLD_PREENCODED : (wlen) < (GMIME_FOLD_LEN - 8))
2595
2596 static gboolean
2597 should_merge_words (rfc822_word *word, rfc822_word *next)
2598 {
2599         switch (word->type) {
2600         case WORD_ATOM:
2601                 if (next->type == WORD_2047)
2602                         return FALSE;
2603
2604                 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, next->type));
2605         case WORD_QSTRING:
2606                 /* avoid merging with words that need to be rfc2047 encoded */
2607                 if (next->type == WORD_2047)
2608                         return FALSE;
2609
2610                 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, WORD_QSTRING));
2611         case WORD_2047:
2612                 if (next->type == WORD_ATOM) {
2613                         /* whether we merge or not is dependent upon:
2614                          * 1. the number of atoms in a row after 'word'
2615                          * 2. if there is another encword after the string of atoms.
2616                          */
2617                         int natoms = 0;
2618
2619                         while (next && next->type == WORD_ATOM) {
2620                                 next = next->next;
2621                                 natoms++;
2622                         }
2623
2624                         /* if all the words after the encword are atoms, don't merge */
2625                         if (!next || natoms > 3)
2626                                 return FALSE;
2627                 }
2628
2629                 /* avoid merging with qstrings */
2630                 if (next->type == WORD_QSTRING)
2631                         return FALSE;
2632
2633                 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, WORD_2047));
2634         default:
2635                 return FALSE;
2636         }
2637 }
2638
2639 static void
2640 rfc2047_encode_merge_rfc822_words (rfc822_word **wordsp)
2641 {
2642         rfc822_word *word, *next, *words = *wordsp;
2643
2644         /* first pass: merge qstrings with adjacent qstrings and encwords with adjacent encwords */
2645         word = words;
2646         while (word && word->next) {
2647                 next = word->next;
2648
2649                 if (word->type != WORD_ATOM && word->type == next->type &&
2650                     MERGED_WORD_LT_FOLDLEN (next->end - word->start, word->type)) {
2651                         /* merge the words */
2652                         word->encoding = MAX (word->encoding, next->encoding);
2653
2654                         word->end = next->end;
2655                         word->next = next->next;
2656
2657                         rfc822_word_free (next);
2658
2659                         next = word;
2660                 }
2661
2662                 word = next;
2663         }
2664
2665         /* second pass: now merge atoms with the other words */
2666         word = words;
2667         while (word && word->next) {
2668                 next = word->next;
2669
2670                 if (should_merge_words (word, next)) {
2671                         /* the resulting word type is the MAX of the 2 types */
2672                         word->type = MAX (word->type, next->type);
2673
2674                         word->encoding = MAX (word->encoding, next->encoding);
2675
2676                         word->end = next->end;
2677                         word->next = next->next;
2678
2679                         rfc822_word_free (next);
2680
2681                         continue;
2682                 }
2683
2684                 word = next;
2685         }
2686
2687         *wordsp = words;
2688 }
2689
2690 static void
2691 g_string_append_len_quoted (GString *out, const char *in, size_t len)
2692 {
2693         register const char *inptr;
2694         const char *inend;
2695
2696         g_string_append_c (out, '"');
2697
2698         inptr = in;
2699         inend = in + len;
2700
2701         while (inptr < inend) {
2702                 if (*inptr == '"' || *inptr == '\\')
2703                         g_string_append_c (out, '\\');
2704
2705                 g_string_append_c (out, *inptr);
2706
2707                 inptr++;
2708         }
2709
2710         g_string_append_c (out, '"');
2711 }
2712
2713 static char *
2714 rfc2047_encode (const char *in, gushort safemask)
2715 {
2716         rfc822_word *words, *word, *prev = NULL;
2717         const char **charsets, *charset;
2718         const char *start;
2719         GMimeCharset mask;
2720         GString *out;
2721         char *outstr;
2722         size_t len;
2723         int i;
2724
2725         if (!(words = rfc2047_encode_get_rfc822_words (in, safemask & IS_PSAFE)))
2726                 return g_strdup (in);
2727
2728         rfc2047_encode_merge_rfc822_words (&words);
2729
2730         charsets = g_mime_user_charsets ();
2731
2732         out = g_string_new ("");
2733
2734         /* output words now with spaces between them */
2735         word = words;
2736         while (word) {
2737                 /* append correct number of spaces between words */
2738                 if (prev && !(prev->type == WORD_2047 && word->type == WORD_2047)) {
2739                         /* one or both of the words are not encoded so we write the spaces out untouched */
2740                         len = word->start - prev->end;
2741                         g_string_append_len (out, prev->end, len);
2742                 }
2743
2744                 switch (word->type) {
2745                 case WORD_ATOM:
2746                         g_string_append_len (out, word->start, (size_t) (word->end - word->start));
2747                         break;
2748                 case WORD_QSTRING:
2749                         g_assert (safemask & IS_PSAFE);
2750                         g_string_append_len_quoted (out, word->start, (size_t) (word->end - word->start));
2751                         break;
2752                 case WORD_2047:
2753                         if (prev && prev->type == WORD_2047) {
2754                                 /* include the whitespace chars between these 2 words in the
2755                                    resulting rfc2047 encoded word. */
2756                                 len = word->end - prev->end;
2757                                 start = prev->end;
2758
2759                                 /* encoded words need to be separated by linear whitespace */
2760                                 g_string_append_c (out, ' ');
2761                         } else {
2762                                 len = word->end - word->start;
2763                                 start = word->start;
2764                         }
2765
2766                         switch (word->encoding) {
2767                         case 0: /* us-ascii */
2768                                 rfc2047_encode_word (out, start, len, "us-ascii", safemask);
2769                                 break;
2770                         case 1: /* iso-8859-1 */
2771                                 rfc2047_encode_word (out, start, len, "iso-8859-1", safemask);
2772                                 break;
2773                         default:
2774                                 charset = NULL;
2775                                 g_mime_charset_init (&mask);
2776                                 g_mime_charset_step (&mask, start, len);
2777
2778                                 for (i = 0; charsets && charsets[i]; i++) {
2779                                         if (g_mime_charset_can_encode (&mask, charsets[i], start, len)) {
2780                                                 charset = charsets[i];
2781                                                 break;
2782                                         }
2783                                 }
2784
2785                                 if (!charset)
2786                                         charset = g_mime_charset_best_name (&mask);
2787
2788                                 rfc2047_encode_word (out, start, len, charset, safemask);
2789                                 break;
2790                         }
2791
2792                         break;
2793                 }
2794
2795                 rfc822_word_free (prev);
2796
2797                 prev = word;
2798                 word = word->next;
2799         }
2800
2801         rfc822_word_free (prev);
2802
2803         outstr = out->str;
2804         g_string_free (out, FALSE);
2805
2806         return outstr;
2807 }
2808
2809
2810 /**
2811  * g_mime_utils_header_encode_phrase:
2812  * @phrase: phrase to encode
2813  *
2814  * Encodes a 'phrase' header according to the rules in rfc2047.
2815  *
2816  * Returns: the encoded 'phrase'. Useful for encoding internet
2817  * addresses.
2818  **/
2819 char *
2820 g_mime_utils_header_encode_phrase (const char *phrase)
2821 {
2822         if (phrase == NULL)
2823                 return NULL;
2824
2825         return rfc2047_encode (phrase, IS_PSAFE);
2826 }
2827
2828
2829 /**
2830  * g_mime_utils_header_encode_text:
2831  * @text: text to encode
2832  *
2833  * Encodes a 'text' header according to the rules in rfc2047.
2834  *
2835  * Returns: the encoded header. Useful for encoding
2836  * headers like "Subject".
2837  **/
2838 char *
2839 g_mime_utils_header_encode_text (const char *text)
2840 {
2841         if (text == NULL)
2842                 return NULL;
2843
2844         return rfc2047_encode (text, IS_ESAFE);
2845 }