Fix FSF address (Tobias Mueller, #470445)
[platform/upstream/evolution-data-server.git] / camel / camel-html-parser.c
1 /*
2  *  Copyright (C) 2001 Ximian Inc.
3  *
4  *  Authors: Michael Zucchi <notzed@ximian.com>
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of version 2 of the GNU Lesser General Public
8  * License as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public
16  * License along with this program; if not, write to the
17  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  * Boston, MA 02110-1301, USA.
19  */
20
21 /* WARNING
22  *
23  * DO NOT USE THIS CODE OUTSIDE OF CAMEL
24  *
25  * IT IS SUBJECT TO CHANGE OR MAY VANISH AT ANY TIME
26  */
27
28 #include <ctype.h>
29 #include <stdio.h>
30 #include <string.h>
31
32 #include <glib.h>
33
34 #include "camel-html-parser.h"
35
36 /* if defined, must also compile in dump_tag() below somewhere */
37 #define d(x)
38
39 static void camel_html_parser_class_init (CamelHTMLParserClass *klass);
40 static void camel_html_parser_init       (CamelObject *o);
41 static void camel_html_parser_finalize   (CamelObject *o);
42
43 static CamelObjectClass *camel_html_parser_parent;
44
45 /* Parser definitions, see below object code for details */
46
47 typedef struct _CamelHTMLParserPrivate CamelHTMLParserPrivate;
48
49 struct _CamelHTMLParserPrivate {
50         char *inbuf,
51                 *inptr,
52                 *inend,
53                 *start;
54         enum _camel_html_parser_t state;
55         char *charset;
56         int eof;
57         GString *tag;
58         GString *ent;
59         char ent_utf8[8];
60         int attr;
61         GPtrArray *attrs;
62         GPtrArray *values;
63         int quote;
64 };
65
66 static void tokenise_setup(void);
67 static CamelHTMLParserPrivate *tokenise_init(void);
68 static void tokenise_free(CamelHTMLParserPrivate *p);
69 static int tokenise_step(CamelHTMLParserPrivate *p, char **datap, int *lenp);
70
71 /* ********************************************************************** */
72
73 CamelType
74 camel_html_parser_get_type (void)
75 {
76         static CamelType type = CAMEL_INVALID_TYPE;
77         
78         if (type == CAMEL_INVALID_TYPE) {
79                 type = camel_type_register (camel_object_get_type (), "CamelHTMLParser",
80                                             sizeof (CamelHTMLParser),
81                                             sizeof (CamelHTMLParserClass),
82                                             (CamelObjectClassInitFunc) camel_html_parser_class_init,
83                                             NULL,
84                                             (CamelObjectInitFunc) camel_html_parser_init,
85                                             (CamelObjectFinalizeFunc) camel_html_parser_finalize);
86         }
87         
88         return type;
89 }
90
91 static void
92 camel_html_parser_finalize(CamelObject *o)
93 {
94         CamelHTMLParser *f = (CamelHTMLParser *)o;
95
96         tokenise_free(f->priv);
97 }
98
99 static void
100 camel_html_parser_init       (CamelObject *o)
101 {
102         CamelHTMLParser *f = (CamelHTMLParser *)o;
103
104         f->priv = tokenise_init();
105 }
106
107 static void
108 camel_html_parser_class_init (CamelHTMLParserClass *klass)
109 {
110         camel_html_parser_parent = CAMEL_OBJECT_CLASS (camel_type_get_global_classfuncs (camel_object_get_type ()));
111
112         tokenise_setup();
113 }
114
115 /**
116  * camel_html_parser_new:
117  *
118  * Create a new CamelHTMLParser object.
119  * 
120  * Return value: A new CamelHTMLParser widget.
121  **/
122 CamelHTMLParser *
123 camel_html_parser_new (void)
124 {
125         CamelHTMLParser *new = CAMEL_HTML_PARSER ( camel_object_new (camel_html_parser_get_type ()));
126         return new;
127 }
128
129
130 void camel_html_parser_set_data(CamelHTMLParser *hp, const char *start, int len, int last)
131 {
132         CamelHTMLParserPrivate *p = hp->priv;
133
134         p->inptr = p->inbuf = (char *)start;
135         p->inend = (char *)start+len;
136         p->eof = last;
137 }
138
139 camel_html_parser_t camel_html_parser_step(CamelHTMLParser *hp, const char **datap, int *lenp)
140 {
141         return tokenise_step(hp->priv, (char **)datap, lenp);
142 }
143
144 const char *camel_html_parser_left(CamelHTMLParser *hp, int *lenp)
145 {
146         CamelHTMLParserPrivate *p = hp->priv;
147
148         if (lenp)
149                 *lenp = p->inend - p->inptr;
150
151         return p->inptr;
152 }
153
154 const char *camel_html_parser_tag(CamelHTMLParser *hp)
155 {
156         return hp->priv->tag->str;
157 }
158
159 const char *camel_html_parser_attr(CamelHTMLParser *hp, const char *name)
160 {
161         int i;
162         CamelHTMLParserPrivate *p = hp->priv;
163
164         for (i=0;i<p->attrs->len;i++) {
165                 if (!g_ascii_strcasecmp(((GString *)p->attrs->pdata[i])->str, name)) {
166                         return ((GString *)p->values->pdata[i])->str;
167                 }
168         }
169
170         return NULL;
171 }
172
173 const GPtrArray *camel_html_parser_attr_list(CamelHTMLParser *hp, const GPtrArray **values)
174 {
175         if (values)
176                 *values = hp->priv->values;
177
178         return hp->priv->attrs;
179 }
180
/*
 * Named HTML character entities and the Unicode code point each one stands
 * for.  This map was taken out of libxml.  tokenise_setup() loads it into
 * the `entities` hash table, keyed by entity name (without '&' and ';').
 *
 * The table is only ever read, so it is declared const.
 */
static const struct {
        unsigned int val;       /* Unicode code point */
        const char *name;       /* entity name, case-sensitive */
} entity_map[] = {
/*
 * the 4 absolute ones,
 */
        { 34,   "quot" },       /* quotation mark = APL quote, U+0022 ISOnum */
        { 38,   "amp" },        /* ampersand, U+0026 ISOnum */
        { 60,   "lt" },         /* less-than sign, U+003C ISOnum */
        { 62,   "gt" },         /* greater-than sign, U+003E ISOnum */

/*
 * the apostrophe (an ASCII character, not part of the 128-255 block below)
 */
        { 39,   "apos" },       /* single quote */

/*
 * A bunch still in the 128-255 range
 * Replacing them depend really on the charset used.
 */
        { 160,  "nbsp" },       /* no-break space = non-breaking space, U+00A0 ISOnum */
        { 161,  "iexcl" },      /* inverted exclamation mark, U+00A1 ISOnum */
        { 162,  "cent" },       /* cent sign, U+00A2 ISOnum */
        { 163,  "pound" },      /* pound sign, U+00A3 ISOnum */
        { 164,  "curren" },     /* currency sign, U+00A4 ISOnum */
        { 165,  "yen" },        /* yen sign = yuan sign, U+00A5 ISOnum */
        { 166,  "brvbar" },     /* broken bar = broken vertical bar, U+00A6 ISOnum */
        { 167,  "sect" },       /* section sign, U+00A7 ISOnum */
        { 168,  "uml" },        /* diaeresis = spacing diaeresis, U+00A8 ISOdia */
        { 169,  "copy" },       /* copyright sign, U+00A9 ISOnum */
        { 170,  "ordf" },       /* feminine ordinal indicator, U+00AA ISOnum */
        { 171,  "laquo" },      /* left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum */
        { 172,  "not" },        /* not sign, U+00AC ISOnum */
        { 173,  "shy" },        /* soft hyphen = discretionary hyphen, U+00AD ISOnum */
        { 174,  "reg" },        /* registered sign = registered trade mark sign, U+00AE ISOnum */
        { 175,  "macr" },       /* macron = spacing macron = overline = APL overbar, U+00AF ISOdia */
        { 176,  "deg" },        /* degree sign, U+00B0 ISOnum */
        { 177,  "plusmn" },     /* plus-minus sign = plus-or-minus sign, U+00B1 ISOnum */
        { 178,  "sup2" },       /* superscript two = superscript digit two = squared, U+00B2 ISOnum */
        { 179,  "sup3" },       /* superscript three = superscript digit three = cubed, U+00B3 ISOnum */
        { 180,  "acute" },      /* acute accent = spacing acute, U+00B4 ISOdia */
        { 181,  "micro" },      /* micro sign, U+00B5 ISOnum */
        { 182,  "para" },       /* pilcrow sign = paragraph sign, U+00B6 ISOnum */
        { 183,  "middot" },     /* middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum */
        { 184,  "cedil" },      /* cedilla = spacing cedilla, U+00B8 ISOdia */
        { 185,  "sup1" },       /* superscript one = superscript digit one, U+00B9 ISOnum */
        { 186,  "ordm" },       /* masculine ordinal indicator, U+00BA ISOnum */
        { 187,  "raquo" },      /* right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum */
        { 188,  "frac14" },     /* vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum */
        { 189,  "frac12" },     /* vulgar fraction one half = fraction one half, U+00BD ISOnum */
        { 190,  "frac34" },     /* vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum */
        { 191,  "iquest" },     /* inverted question mark = turned question mark, U+00BF ISOnum */
        { 192,  "Agrave" },     /* latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 */
        { 193,  "Aacute" },     /* latin capital letter A with acute, U+00C1 ISOlat1 */
        { 194,  "Acirc" },      /* latin capital letter A with circumflex, U+00C2 ISOlat1 */
        { 195,  "Atilde" },     /* latin capital letter A with tilde, U+00C3 ISOlat1 */
        { 196,  "Auml" },       /* latin capital letter A with diaeresis, U+00C4 ISOlat1 */
        { 197,  "Aring" },      /* latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 */
        { 198,  "AElig" },      /* latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 */
        { 199,  "Ccedil" },     /* latin capital letter C with cedilla, U+00C7 ISOlat1 */
        { 200,  "Egrave" },     /* latin capital letter E with grave, U+00C8 ISOlat1 */
        { 201,  "Eacute" },     /* latin capital letter E with acute, U+00C9 ISOlat1 */
        { 202,  "Ecirc" },      /* latin capital letter E with circumflex, U+00CA ISOlat1 */
        { 203,  "Euml" },       /* latin capital letter E with diaeresis, U+00CB ISOlat1 */
        { 204,  "Igrave" },     /* latin capital letter I with grave, U+00CC ISOlat1 */
        { 205,  "Iacute" },     /* latin capital letter I with acute, U+00CD ISOlat1 */
        { 206,  "Icirc" },      /* latin capital letter I with circumflex, U+00CE ISOlat1 */
        { 207,  "Iuml" },       /* latin capital letter I with diaeresis, U+00CF ISOlat1 */
        { 208,  "ETH" },        /* latin capital letter ETH, U+00D0 ISOlat1 */
        { 209,  "Ntilde" },     /* latin capital letter N with tilde, U+00D1 ISOlat1 */
        { 210,  "Ograve" },     /* latin capital letter O with grave, U+00D2 ISOlat1 */
        { 211,  "Oacute" },     /* latin capital letter O with acute, U+00D3 ISOlat1 */
        { 212,  "Ocirc" },      /* latin capital letter O with circumflex, U+00D4 ISOlat1 */
        { 213,  "Otilde" },     /* latin capital letter O with tilde, U+00D5 ISOlat1 */
        { 214,  "Ouml" },       /* latin capital letter O with diaeresis, U+00D6 ISOlat1 */
        { 215,  "times" },      /* multiplication sign, U+00D7 ISOnum */
        { 216,  "Oslash" },     /* latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1 */
        { 217,  "Ugrave" },     /* latin capital letter U with grave, U+00D9 ISOlat1 */
        { 218,  "Uacute" },     /* latin capital letter U with acute, U+00DA ISOlat1 */
        { 219,  "Ucirc" },      /* latin capital letter U with circumflex, U+00DB ISOlat1 */
        { 220,  "Uuml" },       /* latin capital letter U with diaeresis, U+00DC ISOlat1 */
        { 221,  "Yacute" },     /* latin capital letter Y with acute, U+00DD ISOlat1 */
        { 222,  "THORN" },      /* latin capital letter THORN, U+00DE ISOlat1 */
        { 223,  "szlig" },      /* latin small letter sharp s = ess-zed, U+00DF ISOlat1 */
        { 224,  "agrave" },     /* latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 */
        { 225,  "aacute" },     /* latin small letter a with acute, U+00E1 ISOlat1 */
        { 226,  "acirc" },      /* latin small letter a with circumflex, U+00E2 ISOlat1 */
        { 227,  "atilde" },     /* latin small letter a with tilde, U+00E3 ISOlat1 */
        { 228,  "auml" },       /* latin small letter a with diaeresis, U+00E4 ISOlat1 */
        { 229,  "aring" },      /* latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 */
        { 230,  "aelig" },      /* latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 */
        { 231,  "ccedil" },     /* latin small letter c with cedilla, U+00E7 ISOlat1 */
        { 232,  "egrave" },     /* latin small letter e with grave, U+00E8 ISOlat1 */
        { 233,  "eacute" },     /* latin small letter e with acute, U+00E9 ISOlat1 */
        { 234,  "ecirc" },      /* latin small letter e with circumflex, U+00EA ISOlat1 */
        { 235,  "euml" },       /* latin small letter e with diaeresis, U+00EB ISOlat1 */
        { 236,  "igrave" },     /* latin small letter i with grave, U+00EC ISOlat1 */
        { 237,  "iacute" },     /* latin small letter i with acute, U+00ED ISOlat1 */
        { 238,  "icirc" },      /* latin small letter i with circumflex, U+00EE ISOlat1 */
        { 239,  "iuml" },       /* latin small letter i with diaeresis, U+00EF ISOlat1 */
        { 240,  "eth" },        /* latin small letter eth, U+00F0 ISOlat1 */
        { 241,  "ntilde" },     /* latin small letter n with tilde, U+00F1 ISOlat1 */
        { 242,  "ograve" },     /* latin small letter o with grave, U+00F2 ISOlat1 */
        { 243,  "oacute" },     /* latin small letter o with acute, U+00F3 ISOlat1 */
        { 244,  "ocirc" },      /* latin small letter o with circumflex, U+00F4 ISOlat1 */
        { 245,  "otilde" },     /* latin small letter o with tilde, U+00F5 ISOlat1 */
        { 246,  "ouml" },       /* latin small letter o with diaeresis, U+00F6 ISOlat1 */
        { 247,  "divide" },     /* division sign, U+00F7 ISOnum */
        { 248,  "oslash" },     /* latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 */
        { 249,  "ugrave" },     /* latin small letter u with grave, U+00F9 ISOlat1 */
        { 250,  "uacute" },     /* latin small letter u with acute, U+00FA ISOlat1 */
        { 251,  "ucirc" },      /* latin small letter u with circumflex, U+00FB ISOlat1 */
        { 252,  "uuml" },       /* latin small letter u with diaeresis, U+00FC ISOlat1 */
        { 253,  "yacute" },     /* latin small letter y with acute, U+00FD ISOlat1 */
        { 254,  "thorn" },      /* latin small letter thorn with, U+00FE ISOlat1 */
        { 255,  "yuml" },       /* latin small letter y with diaeresis, U+00FF ISOlat1 */

/*
 * Anything below should really be kept as entities references
 */
        { 402,  "fnof" },       /* latin small f with hook = function = florin, U+0192 ISOtech */

        { 913,  "Alpha" },      /* greek capital letter alpha, U+0391 */
        { 914,  "Beta" },       /* greek capital letter beta, U+0392 */
        { 915,  "Gamma" },      /* greek capital letter gamma, U+0393 ISOgrk3 */
        { 916,  "Delta" },      /* greek capital letter delta, U+0394 ISOgrk3 */
        { 917,  "Epsilon" },    /* greek capital letter epsilon, U+0395 */
        { 918,  "Zeta" },       /* greek capital letter zeta, U+0396 */
        { 919,  "Eta" },        /* greek capital letter eta, U+0397 */
        { 920,  "Theta" },      /* greek capital letter theta, U+0398 ISOgrk3 */
        { 921,  "Iota" },       /* greek capital letter iota, U+0399 */
        { 922,  "Kappa" },      /* greek capital letter kappa, U+039A */
        { 923,  "Lambda" },     /* greek capital letter lambda, U+039B ISOgrk3 */
        { 924,  "Mu" },         /* greek capital letter mu, U+039C */
        { 925,  "Nu" },         /* greek capital letter nu, U+039D */
        { 926,  "Xi" },         /* greek capital letter xi, U+039E ISOgrk3 */
        { 927,  "Omicron" },    /* greek capital letter omicron, U+039F */
        { 928,  "Pi" },         /* greek capital letter pi, U+03A0 ISOgrk3 */
        { 929,  "Rho" },        /* greek capital letter rho, U+03A1 */
        { 931,  "Sigma" },      /* greek capital letter sigma, U+03A3 ISOgrk3 */
        { 932,  "Tau" },        /* greek capital letter tau, U+03A4 */
        { 933,  "Upsilon" },    /* greek capital letter upsilon, U+03A5 ISOgrk3 */
        { 934,  "Phi" },        /* greek capital letter phi, U+03A6 ISOgrk3 */
        { 935,  "Chi" },        /* greek capital letter chi, U+03A7 */
        { 936,  "Psi" },        /* greek capital letter psi, U+03A8 ISOgrk3 */
        { 937,  "Omega" },      /* greek capital letter omega, U+03A9 ISOgrk3 */

        { 945,  "alpha" },      /* greek small letter alpha, U+03B1 ISOgrk3 */
        { 946,  "beta" },       /* greek small letter beta, U+03B2 ISOgrk3 */
        { 947,  "gamma" },      /* greek small letter gamma, U+03B3 ISOgrk3 */
        { 948,  "delta" },      /* greek small letter delta, U+03B4 ISOgrk3 */
        { 949,  "epsilon" },    /* greek small letter epsilon, U+03B5 ISOgrk3 */
        { 950,  "zeta" },       /* greek small letter zeta, U+03B6 ISOgrk3 */
        { 951,  "eta" },        /* greek small letter eta, U+03B7 ISOgrk3 */
        { 952,  "theta" },      /* greek small letter theta, U+03B8 ISOgrk3 */
        { 953,  "iota" },       /* greek small letter iota, U+03B9 ISOgrk3 */
        { 954,  "kappa" },      /* greek small letter kappa, U+03BA ISOgrk3 */
        { 955,  "lambda" },     /* greek small letter lambda, U+03BB ISOgrk3 */
        { 956,  "mu" },         /* greek small letter mu, U+03BC ISOgrk3 */
        { 957,  "nu" },         /* greek small letter nu, U+03BD ISOgrk3 */
        { 958,  "xi" },         /* greek small letter xi, U+03BE ISOgrk3 */
        { 959,  "omicron" },    /* greek small letter omicron, U+03BF NEW */
        { 960,  "pi" },         /* greek small letter pi, U+03C0 ISOgrk3 */
        { 961,  "rho" },        /* greek small letter rho, U+03C1 ISOgrk3 */
        { 962,  "sigmaf" },     /* greek small letter final sigma, U+03C2 ISOgrk3 */
        { 963,  "sigma" },      /* greek small letter sigma, U+03C3 ISOgrk3 */
        { 964,  "tau" },        /* greek small letter tau, U+03C4 ISOgrk3 */
        { 965,  "upsilon" },    /* greek small letter upsilon, U+03C5 ISOgrk3 */
        { 966,  "phi" },        /* greek small letter phi, U+03C6 ISOgrk3 */
        { 967,  "chi" },        /* greek small letter chi, U+03C7 ISOgrk3 */
        { 968,  "psi" },        /* greek small letter psi, U+03C8 ISOgrk3 */
        { 969,  "omega" },      /* greek small letter omega, U+03C9 ISOgrk3 */
        { 977,  "thetasym" },   /* greek small letter theta symbol, U+03D1 NEW */
        { 978,  "upsih" },      /* greek upsilon with hook symbol, U+03D2 NEW */
        { 982,  "piv" },        /* greek pi symbol, U+03D6 ISOgrk3 */

        { 8226, "bull" },       /* bullet = black small circle, U+2022 ISOpub */
        { 8230, "hellip" },     /* horizontal ellipsis = three dot leader, U+2026 ISOpub */
        { 8242, "prime" },      /* prime = minutes = feet, U+2032 ISOtech */
        { 8243, "Prime" },      /* double prime = seconds = inches, U+2033 ISOtech */
        { 8254, "oline" },      /* overline = spacing overscore, U+203E NEW */
        { 8260, "frasl" },      /* fraction slash, U+2044 NEW */

        { 8472, "weierp" },     /* script capital P = power set = Weierstrass p, U+2118 ISOamso */
        { 8465, "image" },      /* blackletter capital I = imaginary part, U+2111 ISOamso */
        { 8476, "real" },       /* blackletter capital R = real part symbol, U+211C ISOamso */
        { 8482, "trade" },      /* trade mark sign, U+2122 ISOnum */
        { 8501, "alefsym" },    /* alef symbol = first transfinite cardinal, U+2135 NEW */
        { 8592, "larr" },       /* leftwards arrow, U+2190 ISOnum */
        { 8593, "uarr" },       /* upwards arrow, U+2191 ISOnum */
        { 8594, "rarr" },       /* rightwards arrow, U+2192 ISOnum */
        { 8595, "darr" },       /* downwards arrow, U+2193 ISOnum */
        { 8596, "harr" },       /* left right arrow, U+2194 ISOamsa */
        { 8629, "crarr" },      /* downwards arrow with corner leftwards = carriage return, U+21B5 NEW */
        { 8656, "lArr" },       /* leftwards double arrow, U+21D0 ISOtech */
        { 8657, "uArr" },       /* upwards double arrow, U+21D1 ISOamsa */
        { 8658, "rArr" },       /* rightwards double arrow, U+21D2 ISOtech */
        { 8659, "dArr" },       /* downwards double arrow, U+21D3 ISOamsa */
        { 8660, "hArr" },       /* left right double arrow, U+21D4 ISOamsa */


        { 8704, "forall" },     /* for all, U+2200 ISOtech */
        { 8706, "part" },       /* partial differential, U+2202 ISOtech */
        { 8707, "exist" },      /* there exists, U+2203 ISOtech */
        { 8709, "empty" },      /* empty set = null set = diameter, U+2205 ISOamso */
        { 8711, "nabla" },      /* nabla = backward difference, U+2207 ISOtech */
        { 8712, "isin" },       /* element of, U+2208 ISOtech */
        { 8713, "notin" },      /* not an element of, U+2209 ISOtech */
        { 8715, "ni" },         /* contains as member, U+220B ISOtech */
        { 8719, "prod" },       /* n-ary product = product sign, U+220F ISOamsb */
        { 8721, "sum" },        /* n-ary sumation, U+2211 ISOamsb */
        { 8722, "minus" },      /* minus sign, U+2212 ISOtech */
        { 8727, "lowast" },     /* asterisk operator, U+2217 ISOtech */
        { 8730, "radic" },      /* square root = radical sign, U+221A ISOtech */
        { 8733, "prop" },       /* proportional to, U+221D ISOtech */
        { 8734, "infin" },      /* infinity, U+221E ISOtech */
        { 8736, "ang" },        /* angle, U+2220 ISOamso */
        { 8743, "and" },        /* logical and = wedge, U+2227 ISOtech */
        { 8744, "or" },         /* logical or = vee, U+2228 ISOtech */
        { 8745, "cap" },        /* intersection = cap, U+2229 ISOtech */
        { 8746, "cup" },        /* union = cup, U+222A ISOtech */
        { 8747, "int" },        /* integral, U+222B ISOtech */
        { 8756, "there4" },     /* therefore, U+2234 ISOtech */
        { 8764, "sim" },        /* tilde operator = varies with = similar to, U+223C ISOtech */
        { 8773, "cong" },       /* approximately equal to, U+2245 ISOtech */
        { 8776, "asymp" },      /* almost equal to = asymptotic to, U+2248 ISOamsr */
        { 8800, "ne" },         /* not equal to, U+2260 ISOtech */
        { 8801, "equiv" },      /* identical to, U+2261 ISOtech */
        { 8804, "le" },         /* less-than or equal to, U+2264 ISOtech */
        { 8805, "ge" },         /* greater-than or equal to, U+2265 ISOtech */
        { 8834, "sub" },        /* subset of, U+2282 ISOtech */
        { 8835, "sup" },        /* superset of, U+2283 ISOtech */
        { 8836, "nsub" },       /* not a subset of, U+2284 ISOamsn */
        { 8838, "sube" },       /* subset of or equal to, U+2286 ISOtech */
        { 8839, "supe" },       /* superset of or equal to, U+2287 ISOtech */
        { 8853, "oplus" },      /* circled plus = direct sum, U+2295 ISOamsb */
        { 8855, "otimes" },     /* circled times = vector product, U+2297 ISOamsb */
        { 8869, "perp" },       /* up tack = orthogonal to = perpendicular, U+22A5 ISOtech */
        { 8901, "sdot" },       /* dot operator, U+22C5 ISOamsb */
        { 8968, "lceil" },      /* left ceiling = apl upstile, U+2308 ISOamsc */
        { 8969, "rceil" },      /* right ceiling, U+2309 ISOamsc */
        { 8970, "lfloor" },     /* left floor = apl downstile, U+230A ISOamsc */
        { 8971, "rfloor" },     /* right floor, U+230B ISOamsc */
        { 9001, "lang" },       /* left-pointing angle bracket = bra, U+2329 ISOtech */
        { 9002, "rang" },       /* right-pointing angle bracket = ket, U+232A ISOtech */
        { 9674, "loz" },        /* lozenge, U+25CA ISOpub */

        { 9824, "spades" },     /* black spade suit, U+2660 ISOpub */
        { 9827, "clubs" },      /* black club suit = shamrock, U+2663 ISOpub */
        { 9829, "hearts" },     /* black heart suit = valentine, U+2665 ISOpub */
        { 9830, "diams" },      /* black diamond suit, U+2666 ISOpub */

        { 338,  "OElig" },      /* latin capital ligature OE, U+0152 ISOlat2 */
        { 339,  "oelig" },      /* latin small ligature oe, U+0153 ISOlat2 */
        { 352,  "Scaron" },     /* latin capital letter S with caron, U+0160 ISOlat2 */
        { 353,  "scaron" },     /* latin small letter s with caron, U+0161 ISOlat2 */
        { 376,  "Yuml" },       /* latin capital letter Y with diaeresis, U+0178 ISOlat2 */
        { 710,  "circ" },       /* modifier letter circumflex accent, U+02C6 ISOpub */
        { 732,  "tilde" },      /* small tilde, U+02DC ISOdia */

        { 8194, "ensp" },       /* en space, U+2002 ISOpub */
        { 8195, "emsp" },       /* em space, U+2003 ISOpub */
        { 8201, "thinsp" },     /* thin space, U+2009 ISOpub */
        { 8204, "zwnj" },       /* zero width non-joiner, U+200C NEW RFC 2070 */
        { 8205, "zwj" },        /* zero width joiner, U+200D NEW RFC 2070 */
        { 8206, "lrm" },        /* left-to-right mark, U+200E NEW RFC 2070 */
        { 8207, "rlm" },        /* right-to-left mark, U+200F NEW RFC 2070 */
        { 8211, "ndash" },      /* en dash, U+2013 ISOpub */
        { 8212, "mdash" },      /* em dash, U+2014 ISOpub */
        { 8216, "lsquo" },      /* left single quotation mark, U+2018 ISOnum */
        { 8217, "rsquo" },      /* right single quotation mark, U+2019 ISOnum */
        { 8218, "sbquo" },      /* single low-9 quotation mark, U+201A NEW */
        { 8220, "ldquo" },      /* left double quotation mark, U+201C ISOnum */
        { 8221, "rdquo" },      /* right double quotation mark, U+201D ISOnum */
        { 8222, "bdquo" },      /* double low-9 quotation mark, U+201E NEW */
        { 8224, "dagger" },     /* dagger, U+2020 ISOpub */
        { 8225, "Dagger" },     /* double dagger, U+2021 ISOpub */
        { 8240, "permil" },     /* per mille sign, U+2030 ISOtech */
        { 8249, "lsaquo" },     /* single left-pointing angle quotation mark, U+2039 ISO proposed */
        { 8250, "rsaquo" },     /* single right-pointing angle quotation mark, U+203A ISO proposed */
        { 8364, "euro" }        /* euro sign, U+20AC NEW */
};
461
462 static GHashTable *entities;
463
464 /* this cannot be called in a thread context */
465 static void tokenise_setup(void)
466 {
467         int i;
468
469         if (entities == NULL) {
470                 entities = g_hash_table_new(g_str_hash, g_str_equal);
471                 for (i=0;i<sizeof(entity_map)/sizeof(entity_map[0]);i++) {
472                         g_hash_table_insert(entities, (char *)entity_map[i].name, GUINT_TO_POINTER(entity_map[i].val));
473                 }
474         }
475 }
476
477 static CamelHTMLParserPrivate *tokenise_init(void)
478 {
479         CamelHTMLParserPrivate *p;
480
481         p = g_malloc(sizeof(*p));
482         p->state = CAMEL_HTML_PARSER_DATA;
483
484         p->attr = 0;
485         p->attrs = g_ptr_array_new();
486         p->values = g_ptr_array_new();
487         p->tag = g_string_new("");
488         p->ent = g_string_new("");
489         p->charset = NULL;
490         
491         if (entities == NULL)
492                 tokenise_setup();
493
494         return p;
495 }
496
497 static void tokenise_free(CamelHTMLParserPrivate *p)
498 {
499         int i;
500
501         g_string_free(p->tag, TRUE);
502         g_string_free(p->ent, TRUE);
503         g_free(p->charset);
504
505         for (i=0;i<p->attrs->len;i++)
506                 g_string_free(p->attrs->pdata[i], TRUE);
507
508         for (i=0;i<p->values->len;i++)
509                 g_string_free(p->values->pdata[i], TRUE);
510
511         g_free(p);
512 }
513
514 static int convert_entity(const char *e, char *ent)
515 {
516         unsigned int val;
517
518         if (e[0] == '#')
519                 return g_unichar_to_utf8(atoi(e+1), ent);
520
521         val = GPOINTER_TO_UINT(g_hash_table_lookup(entities, e));
522         if (ent)
523                 return g_unichar_to_utf8(val, ent);
524         else
525                 return 0;
526 }
527
#if 0
/*
 * Debug helper (compiled out): prints the current tag name followed by its
 * attribute name/value pairs, using p->attr as the number of attributes.
 */
static void dump_tag(CamelHTMLParserPrivate *p)
{
        int n;

        printf("got tag: %s\n", p->tag->str);
        printf("%d attributes:\n", p->attr);

        for (n = 0; n < p->attr; n++) {
                GString *key = p->attrs->pdata[n];
                GString *value = p->values->pdata[n];

                printf(" %s = '%s'\n", key->str, value->str);
        }
}
#endif
540
541 static int tokenise_step(CamelHTMLParserPrivate *p, char **datap, int *lenp)
542 {
543         char *in = p->inptr;
544         char *inend = p->inend;
545         char c;
546         int state = p->state, ret, len;
547         char *start = p->inptr;
548
549         d(printf("Tokenise step\n"));
550
551         while (in < inend) {
552                 c = *in++;
553                 switch (state) {
554                 case CAMEL_HTML_PARSER_DATA:
555                         if (c == '<') {
556                                 ret = state;
557                                 state = CAMEL_HTML_PARSER_TAG;
558                                 p->attr = 0;
559                                 g_string_truncate(p->tag, 0);
560                                 d(printf("got data '%.*s'\n", in-start-1, start));
561                                 *datap = start;
562                                 *lenp = in-start-1;
563                                 goto done;
564                         } else if (c=='&') {
565                                 ret = state;
566                                 state = CAMEL_HTML_PARSER_ENT;
567                                 g_string_truncate(p->ent, 0);
568                                 g_string_append_c(p->ent, c);
569                                 d(printf("got data '%.*s'\n", in-start-1, start));
570                                 *datap = start;
571                                 *lenp = in-start-1;
572                                 goto done;
573                         }
574                         break;
575                 case CAMEL_HTML_PARSER_ENT:
576                         if (c==';') {
577                                 len = convert_entity(p->ent->str+1, p->ent_utf8);
578                                 if (len == 0) {
579                                         /* handle broken entity */
580                                         g_string_append_c(p->ent, c);
581                                         ret = state = CAMEL_HTML_PARSER_DATA;
582                                         *datap = p->ent->str;
583                                         *lenp = p->ent->len;
584                                         goto done;
585                                 } else {
586                                         d(printf("got entity: %s = %s\n", p->ent->str, p->ent_utf8));
587                                         ret = state;
588                                         state = CAMEL_HTML_PARSER_DATA;
589                                         *datap = p->ent_utf8;
590                                         *lenp = len;
591                                         goto done;
592                                 }
593                         } else if (isalnum(c) || c=='#') { /* FIXME: right type */
594                                 g_string_append_c(p->ent, c);
595                         } else {
596                                 /* handle broken entity */
597                                 g_string_append_c(p->ent, c);
598                                 ret = state = CAMEL_HTML_PARSER_DATA;
599                                 *datap = p->ent->str;
600                                 *lenp = p->ent->len;
601                                 goto done;
602                         }
603                         break;
604                 case CAMEL_HTML_PARSER_TAG:
605                         if (c == '!') {
606                                 state = CAMEL_HTML_PARSER_COMMENT0;
607                                 g_string_append_c(p->tag, c);
608                         } else if (c == '>') {
609                                 d(dump_tag(p));
610                                 ret = CAMEL_HTML_PARSER_ELEMENT;
611                                 state = CAMEL_HTML_PARSER_DATA;
612                                 goto done;
613                         } else if (c == ' ' || c=='\n' || c=='\t') {
614                                 state = CAMEL_HTML_PARSER_ATTR0;
615                         } else {
616                                 g_string_append_c(p->tag, c);
617                         }
618                         break;
619                         /* check for <!-- */
620                 case CAMEL_HTML_PARSER_COMMENT0:
621                         if (c == '-') {
622                                 g_string_append_c(p->tag, c);
623                                 if (p->tag->len == 3) {
624                                         g_string_truncate(p->tag, 0);
625                                         state = CAMEL_HTML_PARSER_COMMENT;
626                                 }
627                         } else {
628                                 /* got something else, probably dtd entity */
629                                 state = CAMEL_HTML_PARSER_DTDENT;
630                         }
631                         break;
632                 case CAMEL_HTML_PARSER_DTDENT:
633                         if (c == '>') {
634                                 ret = CAMEL_HTML_PARSER_DTDENT;
635                                 state = CAMEL_HTML_PARSER_DATA;
636                                 *datap = start;
637                                 *lenp = in-start-1;
638                                 goto done;
639                         }
640                         break;
641                 case CAMEL_HTML_PARSER_COMMENT:
642                         if (c == '>' && p->tag->len == 2) {
643                                 ret = CAMEL_HTML_PARSER_COMMENT;
644                                 state = CAMEL_HTML_PARSER_DATA;
645                                 *datap = start;
646                                 *lenp = in-start-1;
647                                 goto done;
648                         } else if (c=='-') {
649                                 /* we don't care if we get 'n' --'s before the > */
650                                 if (p->tag->len < 2)
651                                         g_string_append_c(p->tag, c);
652                         } else {
653                                 g_string_truncate(p->tag, 0);
654                         }
655                         break;
656                 case CAMEL_HTML_PARSER_ATTR0:   /* pre-attribute whitespace */
657                         if (c == '>') {
658                                 d(dump_tag(p));
659                                 ret = CAMEL_HTML_PARSER_ELEMENT;
660                                 state = CAMEL_HTML_PARSER_DATA;
661                                 goto done;
662                         } else if (c == ' ' || c=='\n' || c=='\t') {
663                         } else {
664                                 if (p->attrs->len <= p->attr) {
665                                         g_ptr_array_add(p->attrs, g_string_new(""));
666                                         g_ptr_array_add(p->values, g_string_new(""));
667                                 } else {
668                                         g_string_truncate(p->attrs->pdata[p->attr], 0);
669                                         g_string_truncate(p->values->pdata[p->attr], 0);
670                                 }
671                                 g_string_append_c(p->attrs->pdata[p->attr], c);
672                                 state = CAMEL_HTML_PARSER_ATTR;
673                         }
674                         break;
675                 case CAMEL_HTML_PARSER_ATTR:
676                         if (c == '>') {
677                                 d(dump_tag(p));
678                                 ret = CAMEL_HTML_PARSER_ELEMENT;
679                                 state = CAMEL_HTML_PARSER_DATA;
680                                 goto done;
681                         } else if (c == '=') {
682                                 state = CAMEL_HTML_PARSER_VAL0;
683                         } else if (c == ' ' || c=='\n' || c=='\t') {
684                                 state = CAMEL_HTML_PARSER_ATTR0;
685                                 p->attr++;
686                         } else {
687                                 g_string_append_c(p->attrs->pdata[p->attr], c);
688                         }
689                         break;
690                 case CAMEL_HTML_PARSER_VAL0:
691                         if (c == '>') {
692                                 d(printf("value truncated\n"));
693                                 d(dump_tag(p));
694                                 ret = CAMEL_HTML_PARSER_ELEMENT;
695                                 state = CAMEL_HTML_PARSER_DATA;
696                                 goto done;
697                         } else if (c == '\'' || c == '\"') {
698                                 p->quote = c;
699                                 state = CAMEL_HTML_PARSER_VAL;
700                         } else if (c == ' ' || c=='\n' || c=='\t') {
701                         } else {
702                                 g_string_append_c(p->values->pdata[p->attr], c);
703                                 p->quote = 0;
704                                 state = CAMEL_HTML_PARSER_VAL;
705                         }
706                         break;
707                 case CAMEL_HTML_PARSER_VAL:
708                 do_val:
709                         if (p->quote) {
710                                 if (c == '>') {
711                                         d(printf("value truncated\n"));
712                                         d(dump_tag(p));
713                                         ret = CAMEL_HTML_PARSER_ELEMENT;
714                                         state = CAMEL_HTML_PARSER_DATA;
715                                         p->attr++;
716                                         goto done;
717                                 } else if (c == p->quote) {
718                                         state = CAMEL_HTML_PARSER_ATTR0;
719                                         p->attr++;
720                                 } else if (c=='&') {
721                                         state = CAMEL_HTML_PARSER_VAL_ENT;
722                                         g_string_truncate(p->ent, 0);
723                                 } else {
724                                         g_string_append_c(p->values->pdata[p->attr], c);
725                                 }
726                         } else if (c == '>') {
727                                 d(dump_tag(p));
728                                 ret = CAMEL_HTML_PARSER_ELEMENT;
729                                 state = CAMEL_HTML_PARSER_DATA;
730                                 p->attr++;
731                                 goto done;
732                         } else if (c == ' ' || c=='\n' || c=='\t') {
733                                 state = CAMEL_HTML_PARSER_ATTR0;
734                                 p->attr++;
735                         } else if (c=='&') {
736                                 state = CAMEL_HTML_PARSER_VAL_ENT;
737                                 g_string_truncate(p->ent, 0);
738                         } else {
739                                 g_string_append_c(p->values->pdata[p->attr], c);
740                         }
741                         break;
742                 case CAMEL_HTML_PARSER_VAL_ENT:
743                         if (c==';') {
744                                 state = CAMEL_HTML_PARSER_VAL;
745                                 len = convert_entity(p->ent->str+1, p->ent_utf8);
746                                 if (len == 0) {
747                                         /* fallback; broken entity, just output it and see why we ended */
748                                         g_string_append(p->values->pdata[p->attr], p->ent->str);
749                                         g_string_append_c(p->values->pdata[p->attr], ';');
750                                 } else {
751                                         d(printf("got entity: %s = %s\n", p->ent->str, p->ent_utf8));
752                                         g_string_append_len(p->values->pdata[p->attr], p->ent_utf8, len);
753                                 }
754                         } else if (isalnum(c) || c=='#') { /* FIXME: right type */
755                                 g_string_append_c(p->ent, c);
756                         } else {
757                                 /* fallback; broken entity, just output it and see why we ended */
758                                 g_string_append(p->values->pdata[p->attr], p->ent->str);
759                                 goto do_val;
760                         }
761                         break;
762                 }
763         }
764
765         if (p->eof) {
766                 /* FIXME: what about other truncated states? */
767                 switch (state) {
768                 case CAMEL_HTML_PARSER_DATA:
769                 case CAMEL_HTML_PARSER_COMMENT:
770                         if (in > start) {
771                                 ret = state;
772                                 *datap = start;
773                                 *lenp = in-start-1;
774                         } else {
775                                 ret = CAMEL_HTML_PARSER_EOF;
776                                 state = CAMEL_HTML_PARSER_EOF;
777                         }
778                         break;
779                 default:
780                         ret = CAMEL_HTML_PARSER_EOF;
781                         state = CAMEL_HTML_PARSER_EOF;
782                 }
783         } else {
784                 /* we only care about remaining data for this buffer, everything else has its own copy */
785                 switch (state) {
786                 case CAMEL_HTML_PARSER_DATA:
787                 case CAMEL_HTML_PARSER_COMMENT:
788                         if (in > start) {
789                                 ret = state;
790                                 *datap = start;
791                                 *lenp = in-start-1;
792                         } else {
793                                 ret = CAMEL_HTML_PARSER_EOD;
794                         }
795                         break;
796                 default:
797                         ret = CAMEL_HTML_PARSER_EOD;
798                 }
799         }
800
801 done:
802         p->start = start;
803         p->state = state;
804         p->inptr = in;
805
806         return ret;
807 }