Initialize the gmime for upstream
[platform/upstream/gmime.git] / gmime / gmime-filter-html.c
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*  GMime
3  *  Copyright (C) 2000-2012 Jeffrey Stedfast
4  *
5  *  This library is free software; you can redistribute it and/or
6  *  modify it under the terms of the GNU Lesser General Public License
7  *  as published by the Free Software Foundation; either version 2.1
8  *  of the License, or (at your option) any later version.
9  *
10  *  This library is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  *  Lesser General Public License for more details.
14  *
15  *  You should have received a copy of the GNU Lesser General Public
16  *  License along with this library; if not, write to the Free
17  *  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
18  *  02110-1301, USA.
19  */
20
21
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <stdio.h>
27 #include <string.h>
28
29 #include "url-scanner.h"
30 #include "gmime-filter-html.h"
31
32 #define d(x)
33
34
35 /**
36  * SECTION: gmime-filter-html
37  * @title: GMimeFilterHTML
38  * @short_description: Convert plain text into HTML
39  * @see_also: #GMimeFilter
40  *
41  * A #GMimeFilter used for converting plain text into HTML.
42  **/
43
44
45 #define CONVERT_WEB_URLS  GMIME_FILTER_HTML_CONVERT_URLS
46 #define CONVERT_ADDRSPEC  GMIME_FILTER_HTML_CONVERT_ADDRESSES
47
48 static struct {
49         unsigned int mask;
50         urlpattern_t pattern;
51 } patterns[] = {
52         { CONVERT_WEB_URLS, { "file://",   "",        url_file_start,     url_file_end     } },
53         { CONVERT_WEB_URLS, { "ftp://",    "",        url_web_start,      url_web_end      } },
54         { CONVERT_WEB_URLS, { "http://",   "",        url_web_start,      url_web_end      } },
55         { CONVERT_WEB_URLS, { "https://",  "",        url_web_start,      url_web_end      } },
56         { CONVERT_WEB_URLS, { "news://",   "",        url_web_start,      url_web_end      } },
57         { CONVERT_WEB_URLS, { "nntp://",   "",        url_web_start,      url_web_end      } },
58         { CONVERT_WEB_URLS, { "telnet://", "",        url_web_start,      url_web_end      } },
59         { CONVERT_WEB_URLS, { "www.",      "http://", url_web_start,      url_web_end      } },
60         { CONVERT_WEB_URLS, { "ftp.",      "ftp://",  url_web_start,      url_web_end      } },
61         { CONVERT_ADDRSPEC, { "@",         "mailto:", url_addrspec_start, url_addrspec_end } },
62 };
63
64 #define NUM_URL_PATTERNS (sizeof (patterns) / sizeof (patterns[0]))
65
66 static void g_mime_filter_html_class_init (GMimeFilterHTMLClass *klass);
67 static void g_mime_filter_html_init (GMimeFilterHTML *filter, GMimeFilterHTMLClass *klass);
68 static void g_mime_filter_html_finalize (GObject *object);
69
70 static GMimeFilter *filter_copy (GMimeFilter *filter);
71 static void filter_filter (GMimeFilter *filter, char *in, size_t len, size_t prespace,
72                            char **out, size_t *outlen, size_t *outprespace);
73 static void filter_complete (GMimeFilter *filter, char *in, size_t len, size_t prespace,
74                              char **out, size_t *outlen, size_t *outprespace);
75 static void filter_reset (GMimeFilter *filter);
76
77
78 static GMimeFilterClass *parent_class = NULL;
79
80
81 GType
82 g_mime_filter_html_get_type (void)
83 {
84         static GType type = 0;
85         
86         if (!type) {
87                 static const GTypeInfo info = {
88                         sizeof (GMimeFilterHTMLClass),
89                         NULL, /* base_class_init */
90                         NULL, /* base_class_finalize */
91                         (GClassInitFunc) g_mime_filter_html_class_init,
92                         NULL, /* class_finalize */
93                         NULL, /* class_data */
94                         sizeof (GMimeFilterHTML),
95                         0,    /* n_preallocs */
96                         (GInstanceInitFunc) g_mime_filter_html_init,
97                 };
98                 
99                 type = g_type_register_static (GMIME_TYPE_FILTER, "GMimeFilterHTML", &info, 0);
100         }
101         
102         return type;
103 }
104
105
106 static void
107 g_mime_filter_html_class_init (GMimeFilterHTMLClass *klass)
108 {
109         GObjectClass *object_class = G_OBJECT_CLASS (klass);
110         GMimeFilterClass *filter_class = GMIME_FILTER_CLASS (klass);
111         
112         parent_class = g_type_class_ref (GMIME_TYPE_FILTER);
113         
114         object_class->finalize = g_mime_filter_html_finalize;
115         
116         filter_class->copy = filter_copy;
117         filter_class->filter = filter_filter;
118         filter_class->complete = filter_complete;
119         filter_class->reset = filter_reset;
120 }
121
122 static void
123 g_mime_filter_html_init (GMimeFilterHTML *filter, GMimeFilterHTMLClass *klass)
124 {
125         filter->scanner = url_scanner_new ();
126         
127         filter->flags = 0;
128         filter->colour = 0;
129         filter->column = 0;
130         filter->pre_open = FALSE;
131 }
132
133 static void
134 g_mime_filter_html_finalize (GObject *object)
135 {
136         GMimeFilterHTML *html = (GMimeFilterHTML *) object;
137         
138         url_scanner_free (html->scanner);
139         
140         G_OBJECT_CLASS (parent_class)->finalize (object);
141 }
142
143
144 static GMimeFilter *
145 filter_copy (GMimeFilter *filter)
146 {
147         GMimeFilterHTML *html = (GMimeFilterHTML *) filter;
148         
149         return g_mime_filter_html_new (html->flags, html->colour);
150 }
151
152 static char *
153 check_size (GMimeFilter *filter, char *outptr, char **outend, size_t len)
154 {
155         size_t outleft = (size_t) (*outend - outptr);
156         size_t offset;
157         
158         if (outleft >= len)
159                 return outptr;
160         
161         offset = outptr - filter->outbuf;
162         
163         g_mime_filter_set_size (filter, filter->outsize + len, TRUE);
164         
165         *outend = filter->outbuf + filter->outsize;
166         
167         return filter->outbuf + offset;
168 }
169
/*
 * Counts the leading '>' quote markers of the line starting at @in.
 * Each marker after the first may be preceded by a single space.
 *
 * Returns 0 when the line does not begin with '>' or when it begins
 * with ">From" (an escaped From line); otherwise the number of markers.
 *
 * The scan stops at the first '\n' or at the first character that is
 * not part of a quote marker; no explicit end pointer is consulted, so
 * the caller must supply a line that ends in one of those.
 */
static int
citation_depth (const char *in)
{
	const char *p = in;
	int depth;
	
	if (*p != '>')
		return 0;
	p++;
	
	/* check that it isn't an escaped From line */
	if (strncmp (p, "From", 4) == 0)
		return 0;
	
	for (depth = 1; *p != '\n'; depth++) {
		/* tolerate one space between markers */
		if (*p == ' ')
			p++;
		
		if (*p != '>')
			break;
		p++;
	}
	
	return depth;
}
195
196 static inline gunichar
197 html_utf8_getc (const unsigned char **in, const unsigned char *inend)
198 {
199         register const unsigned char *inptr = *in;
200         register unsigned char c, r;
201         register gunichar u, m;
202         
203         if (inptr == inend)
204                 return 0;
205         
206         while (inptr < inend) {
207                 r = *inptr++;
208         loop:
209                 if (r < 0x80) {
210                         *in = inptr;
211                         return r;
212                 } else if (r < 0xf8) { /* valid start char? */
213                         u = r;
214                         m = 0x7f80;     /* used to mask out the length bits */
215                         do {
216                                 if (inptr >= inend)
217                                         return 0xffff;
218                                 
219                                 c = *inptr++;
220                                 if ((c & 0xc0) != 0x80) {
221                                         r = c;
222                                         goto loop;
223                                 }
224                                 
225                                 u = (u << 6) | (c & 0x3f);
226                                 r <<= 1;
227                                 m <<= 5;
228                         } while (r & 0x40);
229                         
230                         *in = inptr;
231                         
232                         u &= ~m;
233                         
234                         return u;
235                 }
236         }
237         
238         return 0xffff;
239 }
240
241 static char *
242 writeln (GMimeFilter *filter, const char *in, const char *end, char *outptr, char **outend)
243 {
244         GMimeFilterHTML *html = (GMimeFilterHTML *) filter;
245         const unsigned char *instart = (const unsigned char *) in;
246         const unsigned char *inend = (const unsigned char *) end;
247         const unsigned char *inptr = instart;
248         
249         while (inptr < inend) {
250                 gunichar u;
251                 
252                 outptr = check_size (filter, outptr, outend, 16);
253                 
254                 u = html_utf8_getc (&inptr, inend);
255                 switch (u) {
256                 case 0xffff:
257                         g_warning ("Invalid UTF-8 sequence encountered");
258                         return outptr;
259                         break;
260                 case '<':
261                         outptr = g_stpcpy (outptr, "&lt;");
262                         html->column++;
263                         break;
264                 case '>':
265                         outptr = g_stpcpy (outptr, "&gt;");
266                         html->column++;
267                         break;
268                 case '&':
269                         outptr = g_stpcpy (outptr, "&amp;");
270                         html->column++;
271                         break;
272                 case '"':
273                         outptr = g_stpcpy (outptr, "&quot;");
274                         html->column++;
275                         break;
276                 case '\t':
277                         if (html->flags & (GMIME_FILTER_HTML_CONVERT_SPACES)) {
278                                 do {
279                                         outptr = check_size (filter, outptr, outend, 7);
280                                         outptr = g_stpcpy (outptr, "&nbsp;");
281                                         html->column++;
282                                 } while (html->column % 8);
283                                 break;
284                         }
285                         /* otherwise, FALL THROUGH */
286                 case ' ':
287                         if (html->flags & GMIME_FILTER_HTML_CONVERT_SPACES) {
288                                 if (inptr == (instart + 1) || (inptr < inend && (*inptr == ' ' || *inptr == '\t'))) {
289                                         outptr = g_stpcpy (outptr, "&nbsp;");
290                                         html->column++;
291                                         break;
292                                 }
293                         }
294                         /* otherwise, FALL THROUGH */
295                 default:
296                         if (u >= 0x20 && u < 0x80) {
297                                 *outptr++ = (char) (u & 0xff);
298                         } else {
299                                 if (html->flags & GMIME_FILTER_HTML_ESCAPE_8BIT)
300                                         *outptr++ = '?';
301                                 else
302                                         outptr += sprintf (outptr, "&#%u;", u);
303                         }
304                         html->column++;
305                         break;
306                 }
307         }
308         
309         return outptr;
310 }
311
312 static void
313 html_convert (GMimeFilter *filter, char *in, size_t inlen, size_t prespace,
314               char **out, size_t *outlen, size_t *outprespace, gboolean flush)
315 {
316         GMimeFilterHTML *html = (GMimeFilterHTML *) filter;
317         register char *inptr, *outptr;
318         char *start, *outend;
319         const char *inend;
320         int depth;
321         
322         g_mime_filter_set_size (filter, inlen * 2 + 6, FALSE);
323         
324         start = inptr = in;
325         inend = in + inlen;
326         outptr = filter->outbuf;
327         outend = filter->outbuf + filter->outsize;
328         
329         if (html->flags & GMIME_FILTER_HTML_PRE && !html->pre_open) {
330                 outptr = g_stpcpy (outptr, "<pre>");
331                 html->pre_open = TRUE;
332         }
333         
334         do {
335                 while (inptr < inend && *inptr != '\n')
336                         inptr++;
337                 
338                 if (inptr == inend && !flush)
339                         break;
340                 
341                 html->column = 0;
342                 depth = 0;
343                 
344                 if (html->flags & GMIME_FILTER_HTML_MARK_CITATION) {
345                         if ((depth = citation_depth (start)) > 0) {
346                                 char font[25];
347                                 
348                                 /* FIXME: we could easily support multiple colour depths here */
349                                 
350                                 g_snprintf (font, 25, "<font color=\"#%06x\">", html->colour);
351                                 
352                                 outptr = check_size (filter, outptr, &outend, 25);
353                                 outptr = g_stpcpy (outptr, font);
354                         } else if (*start == '>') {
355                                 /* >From line */
356                                 start++;
357                         }
358                 } else if (html->flags & GMIME_FILTER_HTML_CITE) {
359                         outptr = check_size (filter, outptr, &outend, 6);
360                         outptr = g_stpcpy (outptr, "&gt; ");
361                         html->column += 2;
362                 }
363                 
364 #define CONVERT_URLS_OR_ADDRESSES (GMIME_FILTER_HTML_CONVERT_URLS | GMIME_FILTER_HTML_CONVERT_ADDRESSES)
365                 if (html->flags & CONVERT_URLS_OR_ADDRESSES) {
366                         size_t matchlen, buflen, len;
367                         urlmatch_t match;
368                         
369                         len = inptr - start;
370                         
371                         do {
372                                 if (url_scanner_scan (html->scanner, start, len, &match)) {
373                                         /* write out anything before the first regex match */
374                                         outptr = writeln (filter, start, start + match.um_so,
375                                                           outptr, &outend);
376                                         
377                                         start += match.um_so;
378                                         len -= match.um_so;
379                                         
380                                         matchlen = match.um_eo - match.um_so;
381                                         
382                                         buflen = 20 + strlen (match.prefix) + matchlen + matchlen;
383                                         outptr = check_size (filter, outptr, &outend, buflen);
384                                         
385                                         /* write out the href tag */
386                                         outptr = g_stpcpy (outptr, "<a href=\"");
387                                         outptr = g_stpcpy (outptr, match.prefix);
388                                         memcpy (outptr, start, matchlen);
389                                         outptr += matchlen;
390                                         outptr = g_stpcpy (outptr, "\">");
391                                         
392                                         /* now write the matched string */
393                                         memcpy (outptr, start, matchlen);
394                                         html->column += matchlen;
395                                         outptr += matchlen;
396                                         start += matchlen;
397                                         len -= matchlen;
398                                         
399                                         /* close the href tag */
400                                         outptr = g_stpcpy (outptr, "</a>");
401                                 } else {
402                                         /* nothing matched so write out the remainder of this line buffer */
403                                         outptr = writeln (filter, start, start + len, outptr, &outend);
404                                         break;
405                                 }
406                         } while (len > 0);
407                 } else {
408                         outptr = writeln (filter, start, inptr, outptr, &outend);
409                 }
410                 
411                 if ((html->flags & GMIME_FILTER_HTML_MARK_CITATION) && depth > 0) {
412                         outptr = check_size (filter, outptr, &outend, 8);
413                         outptr = g_stpcpy (outptr, "</font>");
414                 }
415                 
416                 if (html->flags & GMIME_FILTER_HTML_CONVERT_NL) {
417                         outptr = check_size (filter, outptr, &outend, 5);
418                         outptr = g_stpcpy (outptr, "<br>");
419                 }
420                 
421                 if (inptr < inend)
422                         *outptr++ = '\n';
423                 
424                 start = ++inptr;
425         } while (inptr < inend);
426         
427         if (flush) {
428                 if (html->pre_open) {
429                         /* close the pre-tag */
430                         outptr = check_size (filter, outptr, &outend, 10);
431                         outptr = g_stpcpy (outptr, "</pre>");
432                 }
433         } else if (start < inend) {
434                 /* backup */
435                 g_mime_filter_backup (filter, start, (unsigned) (inend - start));
436         }
437         
438         *out = filter->outbuf;
439         *outlen = outptr - filter->outbuf;
440         *outprespace = filter->outpre;
441 }
442
443 static void
444 filter_filter (GMimeFilter *filter, char *in, size_t len, size_t prespace,
445                char **out, size_t *outlen, size_t *outprespace)
446 {
447         html_convert (filter, in, len, prespace, out, outlen, outprespace, FALSE);
448 }
449
450 static void 
451 filter_complete (GMimeFilter *filter, char *in, size_t len, size_t prespace,
452                  char **out, size_t *outlen, size_t *outprespace)
453 {
454         html_convert (filter, in, len, prespace, out, outlen, outprespace, TRUE);
455 }
456
457 static void
458 filter_reset (GMimeFilter *filter)
459 {
460         GMimeFilterHTML *html = (GMimeFilterHTML *) filter;
461         
462         html->column = 0;
463         html->pre_open = FALSE;
464 }
465
466
467 /**
468  * g_mime_filter_html_new:
469  * @flags: html flags
470  * @colour: citation colour
471  *
472  * Creates a new GMimeFilterHTML filter which can be used to convert a
473  * plain UTF-8 text stream into an html stream.
474  *
475  * Returns: a new html filter.
476  **/
477 GMimeFilter *
478 g_mime_filter_html_new (guint32 flags, guint32 colour)
479 {
480         GMimeFilterHTML *new;
481         guint i;
482         
483         new = g_object_newv (GMIME_TYPE_FILTER_HTML, 0, NULL);
484         new->flags = flags;
485         new->colour = colour;
486         
487         for (i = 0; i < NUM_URL_PATTERNS; i++) {
488                 if (patterns[i].mask & flags)
489                         url_scanner_add (new->scanner, &patterns[i].pattern);
490         }
491         
492         return (GMimeFilter *) new;
493 }