camel/camel-mime-part-utils.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; fill-column: 160 -*- */
   2 /* camel-mime-part-utils : Utility for mime parsing and so on
   3  *
   4  * Authors: Bertrand Guiheneuf <bertrand@helixcode.com>
   5  *          Michael Zucchi <notzed@ximian.com>
   6  *          Jeffrey Stedfast <fejj@ximian.com>
   7  *
   8  * Copyright 1999, 2000 Ximian, Inc. (www.ximian.com)
   9  *
  10  * This program is free software; you can redistribute it and/or
  11  * modify it under the terms of version 2 of the GNU General Public
  12  * License as published by the Free Software Foundation.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  22  * USA
  23  */
  24
  25 #ifdef HAVE_CONFIG_H
  26 #include <config.h>
  27 #endif
  28
  29 #include <stdio.h>
  30 #include <string.h>
  31 #include <unistd.h>
  32 #include <ctype.h>
  33 #include <errno.h>
  34
  35 #include <gal/util/e-iconv.h>
  36
  37 #include "camel-charset-map.h"
  38 #include "camel-mime-part-utils.h"
  39 #include "camel-mime-message.h"
  40 #include "camel-multipart.h"
  41 #include "camel-multipart-signed.h"
  42 #include "camel-multipart-encrypted.h"
  43 #include "camel-seekable-substream.h"
  44 #include "camel-stream-fs.h"
  45 #include "camel-stream-filter.h"
  46 #include "camel-stream-mem.h"
  47 #include "camel-mime-filter-basic.h"
  48 #include "camel-mime-filter-charset.h"
  49 #include "camel-mime-filter-crlf.h"
  50 #include "camel-mime-filter-save.h"
  51 #include "camel-html-parser.h"
  52
  53 #define d(x) /*(printf("%s(%d): ", __FILE__, __LINE__),(x))
  54                #include <stdio.h>*/
  55
  56 /* example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */
  57
  58 static const char *
  59 check_html_charset(char *buffer, int length)
  60 {
  61         CamelHTMLParser *hp;
  62         const char *charset = NULL;
  63         camel_html_parser_t state;
  64         struct _header_content_type *ct;
  65
  66         /* if we need to first base64/qp decode, do this here, sigh */
  67         hp = camel_html_parser_new();
  68         camel_html_parser_set_data(hp, buffer, length, TRUE);
  69
  70         do {
  71                 const char *data;
  72                 int len;
  73                 const char *val;
  74
  75                 state = camel_html_parser_step(hp, &data, &len);
  76
  77                 /* example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */
  78
  79                 switch(state) {
  80                 case CAMEL_HTML_PARSER_ELEMENT:
  81                         val = camel_html_parser_tag(hp);
  82                         d(printf("Got tag: %s\n", val));
  83                         if (strcasecmp(val, "meta") == 0
  84                             && (val = camel_html_parser_attr(hp, "http-equiv"))
  85                             && strcasecmp(val, "content-type") == 0
  86                             && (val = camel_html_parser_attr(hp, "content"))
  87                             && (ct = header_content_type_decode(val))) {
  88                                 charset = header_content_type_param(ct, "charset");
  89                                 charset = e_iconv_charset_name (charset);
  90                                 header_content_type_unref(ct);
  91                         }
  92                         break;
  93                 default:
  94                         /* ignore everything else */
  95                         break;
  96                 }
  97         } while (charset == NULL && state != CAMEL_HTML_PARSER_EOF);
  98
  99         camel_object_unref (hp);
 100
 101         return charset;
 102 }
 103
 104 static GByteArray *
 105 convert_buffer (GByteArray *in, const char *to, const char *from)
 106 {
 107         size_t inleft, outleft, outlen, converted = 0;
 108         GByteArray *out = NULL;
 109         const char *inbuf;
 110         char *outbuf;
 111         iconv_t cd;
 112
 113         if (in->len == 0)
 114                 return g_byte_array_new();
 115
 116         d(printf("converting buffer from %s to %s:\n", from, to));
 117         d(fwrite(in->data, 1, (int)in->len, stdout));
 118         d(printf("\n"));
 119
 120         cd = e_iconv_open(to, from);
 121         if (cd == (iconv_t) -1) {
 122                 g_warning ("Cannot convert from '%s' to '%s': %s", from, to, strerror (errno));
 123                 return NULL;
 124         }
 125
 126         outlen = in->len * 2 + 16;
 127         out = g_byte_array_new ();
 128         g_byte_array_set_size (out, outlen);
 129
 130         inbuf = in->data;
 131         inleft = in->len;
 132
 133         do {
 134                 outbuf = out->data + converted;
 135                 outleft = outlen - converted;
 136
 137                 converted = e_iconv (cd, &inbuf, &inleft, &outbuf, &outleft);
 138                 if (converted == (size_t) -1) {
 139                         if (errno != E2BIG && errno != EINVAL)
 140                                 goto fail;
 141                 }
 142
 143                 /*
 144                  * E2BIG   There is not sufficient room at *outbuf.
 145                  *
 146                  * We just need to grow our outbuffer and try again.
 147                  */
 148
 149                 converted = outbuf - (char *)out->data;
 150                 if (errno == E2BIG) {
 151                         outlen += inleft * 2 + 16;
 152                         out = g_byte_array_set_size (out, outlen);
 153                         outbuf = out->data + converted;
 154                 }
 155
 156         } while (errno == E2BIG && inleft > 0);
 157
 158         /*
 159          * EINVAL  An  incomplete  multibyte sequence has been encoun
 160          *         tered in the input.
 161          *
 162          * We'll just have to ignore it...
 163          */
 164
 165         /* flush the iconv conversion */
 166         e_iconv (cd, NULL, NULL, &outbuf, &outleft);
 167
 168         /* now set the true length on the GByteArray */
 169         converted = outbuf - (char *)out->data;
 170         g_byte_array_set_size (out, converted);
 171
 172         d(printf("converted data:\n"));
 173         d(fwrite(out->data, 1, (int)out->len, stdout));
 174         d(printf("\n"));
 175
 176         e_iconv_close (cd);
 177
 178         return out;
 179
 180  fail:
 181         g_warning ("Cannot convert from '%s' to '%s': %s", from, to, strerror (errno));
 182
 183         g_byte_array_free (out, TRUE);
 184
 185         e_iconv_close (cd);
 186
 187         return NULL;
 188 }
 189
 190 /* We don't really use the charset argument except for debugging... */
 191 static gboolean
 192 broken_windows_charset (GByteArray *buffer, const char *charset)
 193 {
 194         register unsigned char *inptr;
 195         unsigned char *inend;
 196
 197         inptr = buffer->data;
 198         inend = inptr + buffer->len;
 199
 200         while (inptr < inend) {
 201                 register unsigned char c = *inptr++;
 202
 203                 if (c >= 128 && c <= 159) {
 204                         g_warning ("Encountered Windows charset parading as %s", charset);
 205                         return TRUE;
 206                 }
 207         }
 208
 209         return FALSE;
 210 }
 211
 212 static gboolean
 213 is_7bit (GByteArray *buffer)
 214 {
 215         register unsigned int i;
 216
 217         for (i = 0; i < buffer->len; i++)
 218                 if (buffer->data[i] > 127)
 219                         return FALSE;
 220
 221         return TRUE;
 222 }
 223
 224 static const char *iso_charsets[] = {
 225         "us-ascii",
 226         "iso-8859-1",
 227         "iso-8859-2",
 228         "iso-8859-3",
 229         "iso-8859-4",
 230         "iso-8859-5",
 231         "iso-8859-6",
 232         "iso-8859-7",
 233         "iso-8859-8",
 234         "iso-8859-9",
 235         "iso-8859-10",
 236         "iso-8859-11",
 237         "iso-8859-12",
 238         "iso-8859-13",
 239         "iso-8859-14",
 240         "iso-8859-15",
 241         "iso-8859-16"
 242 };
 243
 244 #define NUM_ISO_CHARSETS (sizeof (iso_charsets) / sizeof (iso_charsets[0]))
 245
 246 static const char *
 247 canon_charset_name (const char *charset)
 248 {
 249         const char *ptr;
 250         char *endptr;
 251         int iso;
 252
 253         if (strncasecmp (charset, "iso", 3) != 0)
 254                 return charset;
 255
 256         ptr = charset + 3;
 257         if (*ptr == '-' || *ptr == '_')
 258                 ptr++;
 259
 260         /* if it's not an iso-8859-# charset, we don't care about it */
 261         if (strncmp (ptr, "8859", 4) != 0)
 262                 return charset;
 263
 264         ptr += 4;
 265         if (*ptr == '-' || *ptr == '_')
 266                 ptr++;
 267
 268         iso = strtoul (ptr, &endptr, 10);
 269         if (endptr == ptr || *endptr != '\0')
 270                 return charset;
 271
 272         if (iso >= NUM_ISO_CHARSETS)
 273                 return charset;
 274
 275         return iso_charsets[iso];
 276 }
 277
 278 /* simple data wrapper */
 279 static void
 280 simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser *mp)
 281 {
 282         CamelMimeFilter *fdec = NULL, *fcrlf = NULL;
 283         CamelMimeFilterBasicType enctype = 0;
 284         size_t len;
 285         int decid = -1, crlfid = -1;
 286         struct _header_content_type *ct;
 287         const char *charset = NULL;
 288         char *encoding, *buf;
 289         GByteArray *buffer;
 290         CamelStream *mem;
 291
 292         d(printf ("simple_data_wrapper_construct_from_parser()\n"));
 293
 294         /* first, work out conversion, if any, required, we dont care about what we dont know about */
 295         encoding = header_content_encoding_decode (camel_mime_parser_header (mp, "Content-Transfer-Encoding", NULL));
 296         if (encoding) {
 297                 if (!strcasecmp (encoding, "base64")) {
 298                         d(printf("Adding base64 decoder ...\n"));
 299                         enctype = CAMEL_MIME_FILTER_BASIC_BASE64_DEC;
 300                 } else if (!strcasecmp (encoding, "quoted-printable")) {
 301                         d(printf("Adding quoted-printable decoder ...\n"));
 302                         enctype = CAMEL_MIME_FILTER_BASIC_QP_DEC;
 303                 } else if (!strcasecmp (encoding, "x-uuencode")) {
 304                         d(printf("Adding uudecoder ...\n"));
 305                         enctype = CAMEL_MIME_FILTER_BASIC_UU_DEC;
 306                 }
 307                 g_free (encoding);
 308
 309                 if (enctype != 0) {
 310                         fdec = (CamelMimeFilter *)camel_mime_filter_basic_new_type(enctype);
 311                         decid = camel_mime_parser_filter_add (mp, fdec);
 312                 }
 313         }
 314
 315         /* If we're doing text, we also need to do CRLF->LF and may have to convert it to UTF8 as well. */
 316         ct = camel_mime_parser_content_type (mp);
 317         if (header_content_type_is (ct, "text", "*")) {
 318                 charset = header_content_type_param (ct, "charset");
 319                 charset = e_iconv_charset_name (charset);
 320
 321                 if (fdec) {
 322                         d(printf ("Adding CRLF conversion filter\n"));
 323                         fcrlf = camel_mime_filter_crlf_new (CAMEL_MIME_FILTER_CRLF_DECODE,
 324                                                             CAMEL_MIME_FILTER_CRLF_MODE_CRLF_ONLY);
 325                         crlfid = camel_mime_parser_filter_add (mp, fcrlf);
 326                 }
 327         }
 328
 329         /* read in the entire content */
 330         buffer = g_byte_array_new ();
 331         while (camel_mime_parser_step (mp, &buf, &len) != HSCAN_BODY_END) {
 332                 d(printf("appending o/p data: %d: %.*s\n", len, len, buf));
 333                 g_byte_array_append (buffer, buf, len);
 334         }
 335
 336         /* check for broken Outlook/Web mailers that like to send html marked as text/plain */
 337         if (header_content_type_is (ct, "text", "plain")) {
 338                 register const unsigned char *inptr;
 339                 const unsigned char *inend;
 340
 341                 inptr = buffer->data;
 342                 inend = inptr + buffer->len;
 343
 344                 while (inptr < inend && isspace ((int) *inptr))
 345                         inptr++;
 346
 347                 if (((inend-inptr) > 5 && g_ascii_strncasecmp(inptr, "<html", 5) == 0)
 348                     || ((inend-inptr) > 9 && g_ascii_strncasecmp(inptr, "<!doctype", 9) == 0)) {
 349                         /* re-tag as text/html */
 350                         g_free (ct->subtype);
 351                         ct->subtype = g_strdup ("html");
 352                 }
 353         }
 354
 355         /* Possible Lame Mailer Alert... check the META tags for a charset */
 356         if (!charset && header_content_type_is (ct, "text", "html")) {
 357                 if ((charset = check_html_charset (buffer->data, buffer->len)))
 358                         header_content_type_set_param (ct, "charset", charset);
 359         }
 360
 361         /* if we need to do charset conversion, see if we can/it works/etc */
 362         if (charset && !(strcasecmp (charset, "us-ascii") == 0
 363                          || strcasecmp (charset, "utf-8") == 0
 364                          || strncasecmp (charset, "x-", 2) == 0)) {
 365                 GByteArray *out;
 366
 367                 /* You often see Microsoft Windows users announcing their texts
 368                  * as being in ISO-8859-1 even when in fact they contain funny
 369                  * characters from the Windows-CP1252 superset.
 370                  */
 371                 charset = canon_charset_name (charset);
 372                 if (!strncasecmp (charset, "iso-8859", 8)) {
 373                         /* check for Windows-specific chars... */
 374                         if (broken_windows_charset (buffer, charset))
 375                                 charset = camel_charset_iso_to_windows (charset);
 376                 }
 377
 378                 out = convert_buffer (buffer, "UTF-8", charset);
 379                 if (out) {
 380                         /* converted ok, use this data instead */
 381                         g_byte_array_free(buffer, TRUE);
 382                         dw->rawtext = FALSE;
 383                         buffer = out;
 384                 } else {
 385                         /* else failed to convert, leave as raw? */
 386                         g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset);
 387                         dw->rawtext = TRUE;
 388                 }
 389         } else if (header_content_type_is (ct, "text", "*")) {
 390                 if (charset == NULL || !strcasecmp (charset, "us-ascii")) {
 391                         /* check that it's 7bit */
 392                         dw->rawtext = !is_7bit (buffer);
 393                 } else if (!strncasecmp (charset, "x-", 2)) {
 394                         /* we're not even going to bother trying to convert, so set the
 395                            rawtext bit to TRUE and let the mailer deal with it. */
 396                         dw->rawtext = TRUE;
 397                 } else if (!strcasecmp (charset, "utf-8") && buffer->len) {
 398                         /* check that it is valid utf8 */
 399                         dw->rawtext = !g_utf8_validate (buffer->data, buffer->len, NULL);
 400                 }
 401         }
 402
 403         d(printf("message part kept in memory!\n"));
 404
 405         mem = camel_stream_mem_new_with_byte_array(buffer);
 406         camel_data_wrapper_construct_from_stream(dw, mem);
 407         camel_object_unref((CamelObject *)mem);
 408
 409         camel_mime_parser_filter_remove(mp, decid);
 410         camel_mime_parser_filter_remove(mp, crlfid);
 411
 412         if (fdec)
 413                 camel_object_unref((CamelObject *)fdec);
 414         if (fcrlf)
 415                 camel_object_unref((CamelObject *)fcrlf);
 416 }
 417
 418 /* This replaces the data wrapper repository ... and/or could be replaced by it? */
 419 void
 420 camel_mime_part_construct_content_from_parser (CamelMimePart *dw, CamelMimeParser *mp)
 421 {
 422         CamelDataWrapper *content = NULL;
 423         CamelContentType *ct;
 424
 425         ct = camel_mime_parser_content_type (mp);
 426
 427         switch (camel_mime_parser_state (mp)) {
 428         case HSCAN_HEADER:
 429                 d(printf("Creating body part\n"));
 430                 /* multipart/signed is some fucked up type that we must treat as binary data, fun huh, idiots. */
 431                 if (header_content_type_is (ct, "multipart", "signed")) {
 432                         content = (CamelDataWrapper *) camel_multipart_signed_new ();
 433                         camel_multipart_construct_from_parser ((CamelMultipart *) content, mp);
 434                 } else {
 435                         content = camel_data_wrapper_new ();
 436                         simple_data_wrapper_construct_from_parser (content, mp);
 437                 }
 438                 break;
 439         case HSCAN_MESSAGE:
 440                 d(printf("Creating message part\n"));
 441                 content = (CamelDataWrapper *) camel_mime_message_new ();
 442                 camel_mime_part_construct_from_parser ((CamelMimePart *)content, mp);
 443                 break;
 444         case HSCAN_MULTIPART:
 445                 d(printf("Creating multi-part\n"));
 446                 if (header_content_type_is (ct, "multipart", "encrypted"))
 447                         content = (CamelDataWrapper *) camel_multipart_encrypted_new ();
 448                 else if (header_content_type_is (ct, "multipart", "signed"))
 449                         content = (CamelDataWrapper *) camel_multipart_signed_new ();
 450                 else
 451                         content = (CamelDataWrapper *) camel_multipart_new ();
 452
 453                 camel_multipart_construct_from_parser((CamelMultipart *)content, mp);
 454                 d(printf("Created multi-part\n"));
 455                 break;
 456         default:
 457                 g_warning("Invalid state encountered???: %d", camel_mime_parser_state (mp));
 458         }
 459         if (content) {
 460                 /* would you believe you have to set this BEFORE you set the content object???  oh my god !!!! */
 461                 camel_data_wrapper_set_mime_type_field (content, camel_mime_part_get_content_type (dw));
 462                 camel_medium_set_content_object ((CamelMedium *)dw, content);
 463
 464                 /* Note: we don't set ct as the content-object's mime-type above because
 465                  * camel_medium_set_content_object() may re-write the Content-Type header
 466                  * (see CamelMimePart::set_content_object) if we did that (which is a Bad Thing).
 467                  * However, if we set it *afterward*, we can still use any special auto-detections
 468                  * that we found in simple_data_wrapper_construct_from_parser(). This is important
 469                  * later when we go to render the MIME parts in mail-format.c */
 470                 camel_data_wrapper_set_mime_type_field (content, ct);
 471
 472                 camel_object_unref (content);
 473         }
 474 }