1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * Authors: Jeffrey Stedfast <fejj@ximian.com>
5 * Copyright 2002 Ximian, Inc. (www.ximian.com)
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
32 #include <libedataserver/e-trie.h>
34 #include "camel-url-scanner.h"
35 #include "camel-utf8.h"
/* Scanner state.  The member list is not visible in this excerpt; the
 * functions below use a `patterns` pointer array and a `trie` member. */
37 struct _CamelUrlScanner {
/* Create a scanner that starts with an empty pattern array and a newly
 * created trie.  Patterns are registered afterwards with
 * camel_url_scanner_add(). */
44 camel_url_scanner_new (void)
46 CamelUrlScanner *scanner;
/* g_new() hands back uninitialized memory, so the members are filled in below */
48 scanner = g_new (CamelUrlScanner, 1);
49 scanner->patterns = g_ptr_array_new ();
/* NOTE(review): TRUE presumably selects case-insensitive matching -- confirm against e-trie.h */
50 scanner->trie = e_trie_new (TRUE);
/* Release the resources held by @scanner.  A NULL @scanner is rejected by the
 * g_return_if_fail() guard and nothing is freed. */
57 camel_url_scanner_free (CamelUrlScanner *scanner)
59 g_return_if_fail (scanner != NULL);
/* TRUE also frees the array's pointer storage.  The array was created with
 * g_ptr_array_new() (no element free function), so the urlpattern_t objects
 * that were added are not freed here and stay owned by whoever added them. */
61 g_ptr_array_free (scanner->patterns, TRUE);
62 e_trie_free (scanner->trie);
/* Register @pattern with @scanner.  The pattern is stored by pointer (not
 * copied), so it has to outlive the scanner.
 * NOTE(review): only @scanner is guarded in the visible lines; @pattern is
 * dereferenced unconditionally. */
68 camel_url_scanner_add (CamelUrlScanner *scanner, urlpattern_t *pattern)
70 g_return_if_fail (scanner != NULL);
/* the trie id is the current array length, i.e. the index the pattern gets
 * when it is appended on the next line; camel_url_scanner_scan() uses that id
 * to look the pattern up again */
72 e_trie_add (scanner->trie, pattern->pattern, scanner->patterns->len);
73 g_ptr_array_add (scanner->patterns, pattern);
/* Scan the @inlen bytes at @in for a registered pattern whose start and end
 * callbacks both accept the hit.  @match receives the pattern string and
 * prefix of the candidate being tried; the callbacks are handed @match as
 * well.  Returns FALSE when @scanner or @in is NULL.  The loop header and the
 * other return statements are not visible in this excerpt. */
78 camel_url_scanner_scan (CamelUrlScanner *scanner, const char *in, size_t inlen, urlmatch_t *match)
80 const char *pos, *inptr, *inend;
84 g_return_val_if_fail (scanner != NULL, FALSE);
85 g_return_val_if_fail (in != NULL, FALSE);
/* look for the next pattern occurrence in the remaining input; the id of the
 * pattern that matched is stored in `pattern` */
91 if (!(pos = e_trie_search (scanner->trie, inptr, inlen, &pattern)))
/* the trie id doubles as the index into the patterns array (see
 * camel_url_scanner_add()) */
94 pat = g_ptr_array_index (scanner->patterns, pattern);
96 match->pattern = pat->pattern;
97 match->prefix = pat->prefix;
/* both callbacks get the buffer start, the hit position and the buffer end;
 * the end callback only runs when the start callback succeeded */
99 if (pat->start (in, pos, inend, match) && pat->end (in, pos, inend, match))
/* step over one UTF-8 character without reading past inend; 0xffff is
 * treated as a sentinel (presumably invalid or truncated input -- confirm
 * against camel-utf8.h) */
103 if (camel_utf8_getc_limit (&inptr, (const unsigned char *) inend) == 0xffff)
/* shrink the search window to what is left after the advance */
106 inlen = inend - inptr;
107 } while (inptr < inend);
/* Character classification table indexed by byte value (0..255).  Each entry
 * is a bitmask of the IS_* flags that the is_*() macros below test.  The
 * BUILD_TABLE generator further down prints a definition in exactly this
 * layout (16 values per row). */
113 static unsigned char url_scanner_table[256] = {
114 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1,
115 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
116 24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160,
117 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128,
118 160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
119 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,128,
120 128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
121 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128, 1,
122 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
123 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
124 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
125 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
127 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
128 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
/* Flag bits stored in url_scanner_table entries.  Only three of the flags are
 * visible in this excerpt; the macros below additionally rely on IS_CTRL,
 * IS_LWSP, IS_SPACE, IS_ALPHA and IS_DIGIT. */
138 IS_SPECIAL = (1 << 5),
139 IS_DOMAIN = (1 << 6),
140 IS_URLSAFE = (1 << 7),
/* Character-class predicates backed by url_scanner_table.  The argument is
 * cast to unsigned char before indexing, so plain (possibly negative) char
 * values are safe to pass.  Each macro evaluates its argument exactly once
 * and yields 0 or 1. */
143 #define is_ctrl(x) ((url_scanner_table[(unsigned char)(x)] & IS_CTRL) != 0)
144 #define is_lwsp(x) ((url_scanner_table[(unsigned char)(x)] & IS_LWSP) != 0)
/* an atom character is anything that is not special, space or control */
145 #define is_atom(x) ((url_scanner_table[(unsigned char)(x)] & (IS_SPECIAL|IS_SPACE|IS_CTRL)) == 0)
146 #define is_alpha(x) ((url_scanner_table[(unsigned char)(x)] & IS_ALPHA) != 0)
147 #define is_digit(x) ((url_scanner_table[(unsigned char)(x)] & IS_DIGIT) != 0)
148 #define is_domain(x) ((url_scanner_table[(unsigned char)(x)] & IS_DOMAIN) != 0)
/* letters and digits count as url-safe in addition to the IS_URLSAFE set */
149 #define is_urlsafe(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
/* Table of brace pairs (referred to below as url_braces).  Its entries are not
 * visible in this excerpt; the code below reads an `open` and a `close`
 * character from each element. */
152 static const struct {
/* Check @c against the `open` character of every url_braces entry.  The
 * return statements are not visible in this excerpt. */
165 is_open_brace (char c)
169 for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
170 if (c == url_braces[i].open)
/* Inspect the character immediately before offset @so in @in.  When it is the
 * `open` character of a url_braces entry, that character is stored through
 * @open_brace (if non-NULL) and the entry's `close` character is returned.
 * What is returned when no entry matches is not visible in this excerpt. */
178 url_stop_at_brace (const char *in, size_t so, char *open_brace)
/* the statement guarded here is not visible; presumably it resets *open_brace */
182 if (open_brace != NULL)
/* NOTE(review): in[so - 1] needs so > 0 -- confirm that a guard precedes this loop */
186 for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
187 if (in[so - 1] == url_braces[i].open) {
188 if (open_brace != NULL)
189 *open_brace = url_braces[i].open;
190 return url_braces[i].close;
/* Start callback for address patterns.  @pos must point at the '@' that was
 * matched (asserted below).  The offset of the address start, relative to
 * @in, is stored in match->um_so.  The pointer adjustments between the
 * visible tests are missing from this excerpt; the `inptr > in` bounds
 * suggest a backwards walk from the '@' towards the buffer start. */
200 camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
202 register const char *inptr = pos;
204 g_assert (*inptr == '@');
210 if (is_atom (*inptr))
/* consume one run of atom characters, never moving before the buffer start */
215 while (inptr > in && is_atom (*inptr))
/* a '.' joins two atom runs of the local part */
218 if (inptr > in && *inptr == '.')
/* skip characters that cannot begin the address (non-atoms and open braces).
 * NOTE(review): no bound on inptr is visible in this condition -- confirm the
 * loop cannot run past the '@' */
222 while (!is_atom (*inptr) || is_open_brace (*inptr))
228 match->um_so = (inptr - in);
/* End callback for address patterns.  @pos must point at the matched '@'
 * (asserted below).  The part after the '@' is walked forwards, bounded by
 * @inend, and the end offset relative to @in is stored in match->um_eo.
 * Branch structure and return statements are not visible in this excerpt. */
234 camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
236 const char *inptr = pos;
237 int parts = 0, digits;
238 gboolean got_dot = FALSE;
240 g_assert (*inptr == '@');
/* read at most three consecutive digits (presumably one component of a
 * numeric address, counted in `parts`) */
250 while (inptr < inend && is_digit (*inptr) && digits < 3) {
/* NOTE(review): *inptr is read here without a visible inptr < inend test */
257 if (*inptr != '.' && parts != 4)
268 while (inptr < inend) {
269 if (is_domain (*inptr))
/* consume one run of domain characters */
274 while (inptr < inend && is_domain (*inptr))
/* a '.' only continues the domain when a domain character follows it.
 * NOTE(review): inptr[1] may be the byte at inend; this relies on that byte
 * being readable (e.g. a NUL terminator) */
277 if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) {
/* reject when nothing was consumed after the '@' or no '.' was seen */
285 /* don't allow toplevel domains */
286 if (inptr == pos + 1 || !got_dot)
289 match->um_eo = (inptr - in);
/* Start callback for file patterns: the match begins exactly where the
 * pattern was found, so the start offset is simply @pos relative to @in. */
295 camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
297 match->um_so = (pos - in);
/* End callback for file patterns: skip the pattern text, then extend the
 * match over url-safe characters and store the end offset (relative to @in)
 * in match->um_eo.  Relies on match->um_so having been set by the start
 * callback. */
303 camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
305 register const char *inptr = pos;
/* match->pattern has to be a NUL-terminated string for strlen() */
308 inptr += strlen (match->pattern);
/* if the character just before the match is an opening brace, its closing
 * counterpart terminates the url */
313 close_brace = url_stop_at_brace (in, match->um_so, NULL);
315 while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace)
321 match->um_eo = (inptr - in);
/* Start callback for web patterns.  A hit that begins with "www"
 * (case-sensitive compare) and is not at the very start of the buffer gets
 * an extra check on the preceding character; the match start offset is then
 * stored in match->um_so.  The statement executed when the check fires is not
 * visible in this excerpt. */
327 camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
329 if (pos > in && !strncmp (pos, "www", 3)) {
330 /* make sure we aren't actually part of another word */
/* NOTE(review): isspace() is given a plain char; negative values (bytes
 * >= 0x80 where char is signed) are undefined behaviour -- consider casting
 * to unsigned char */
331 if (!is_open_brace (pos[-1]) && !isspace (pos[-1]))
335 match->um_so = (pos - in);
/* End callback for web patterns.  Starting after the pattern text it walks
 * over the host (which may turn out to be user[:password]@host), an optional
 * port and an optional path, strips trailing punctuation and stores the end
 * offset (relative to @in) in match->um_eo.  Many statements of this function
 * (the switch header, pointer increments, breaks, returns) are missing from
 * this excerpt; the notes below cover only the visible lines. */
341 camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
343 register const char *inptr = pos;
/* consulted by the port/password logic below; where it is set is not visible */
344 gboolean passwd = FALSE;
346 char close_brace, open_brace;
/* match->pattern has to be a NUL-terminated string for strlen() */
350 inptr += strlen (match->pattern);
/* pick up the brace pair (if any) whose opening character sits just before
 * the match; relies on match->um_so from the start callback */
352 close_brace = url_stop_at_brace (in, match->um_so, &open_brace);
354 /* find the end of the domain */
/* NOTE(review): *inptr is read here without a visible inptr < inend test */
355 if (is_atom (*inptr)) {
356 /* might be a domain or user@domain */
358 while (inptr < inend) {
359 if (!is_atom (*inptr))
364 while (inptr < inend && is_atom (*inptr))
/* a '.' continues the host only when an atom character or '/' follows; the
 * (inptr + 1) < inend test keeps inptr[1] inside the buffer */
367 if ((inptr + 1) < inend && *inptr == '.' && (is_atom (inptr[1]) || inptr[1] == '/'))
377 } else if (is_domain (*inptr)) {
379 while (inptr < inend) {
380 if (!is_domain (*inptr))
385 while (inptr < inend && is_domain (*inptr))
/* same rule as above, restricted to domain characters */
388 if ((inptr + 1) < inend && *inptr == '.' && (is_domain (inptr[1]) || inptr[1] == '/'))
397 case ':': /* we either have a port or a password */
400 if (is_digit (*inptr) || passwd) {
401 port = (*inptr++ - '0');
/* accumulate decimal digits, stopping once the value leaves the valid port
 * range */
403 while (inptr < inend && is_digit (*inptr) && port < 65536)
404 port = (port * 10) + (*inptr++ - '0');
/* an out-of-range number or a following '@' means the digits were not a port.
 * NOTE(review): *inptr is read here without a visible inptr < inend test */
406 if (!passwd && (port >= 65536 || *inptr == '@')) {
408 /* this must be a password? */
419 while (inptr < inend && is_atom (*inptr))
422 if ((inptr + 2) < inend) {
425 if (is_domain (*inptr))
433 if (inptr >= inend || *inptr != '/')
436 /* we have a '/' so there could be a path - fall through */
437 case '/': /* we've detected a path component to our url */
/* consume the path; brace_stack (declared on a line not shown) presumably
 * tracks nesting of the enclosing brace pair, and -1 would then mean a
 * closing brace without a matching opener inside the url */
440 while (inptr < inend && is_urlsafe (*inptr)) {
441 if (*inptr == open_brace) {
444 } else if (*inptr == close_brace) {
447 if (brace_stack == -1)
459 /* urls are extremely unlikely to end with any
460 * punctuation, so strip any trailing
461 * punctuation off. Also strip off any closing
463 while (inptr > pos && strchr (",.:;?!-|}]\"", inptr[-1]))
467 match->um_eo = (inptr - in);
/* Character sets fed to table_init_bits() when the classification table is
 * regenerated; each string lists the bytes that receive one flag. */
478 /* got these from rfc1738 */
/* NOTE(review): the specials set below matches the "specials" production of
 * rfc0822 (the comment in url_scanner_table_init() also cites rfc0822), so the
 * rfc1738 citation above looks wrong for these two defines -- confirm */
479 #define CHARS_LWSP " \t\n\r" /* linear whitespace chars */
480 #define CHARS_SPECIAL "()<>@,;:\\\".[]"
482 /* got these from rfc1738 */
483 #define CHARS_URLSAFE "$-_.+!*'(),{}|\\^~[]`#%\";/?:@&="
/* OR @mask into the url_scanner_table entry of every byte listed in the
 * NUL-terminated string @vals.  Entries are unsigned char, so @mask is
 * expected to fit in 8 bits (the visible IS_* flags go up to 1 << 7). */
487 table_init_bits (unsigned int mask, const unsigned char *vals)
491 for (i = 0; vals[i] != '\0'; i++)
492 url_scanner_table[vals[i]] |= mask;
/* Rebuild url_scanner_table from scratch: clear every entry, set the
 * per-range flags in the loop, then add the individually flagged characters
 * and the CHARS_* sets.  The conditions guarding the two IS_CTRL assignments
 * are not visible in this excerpt. */
496 url_scanner_table_init (void)
500 for (i = 0; i < 256; i++) {
501 url_scanner_table[i] = 0;
503 url_scanner_table[i] |= IS_CTRL;
/* digits and letters are also valid domain characters */
504 if ((i >= '0' && i <= '9'))
505 url_scanner_table[i] |= IS_DIGIT | IS_DOMAIN;
506 if ((i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z'))
507 url_scanner_table[i] |= IS_ALPHA | IS_DOMAIN;
509 url_scanner_table[i] |= IS_CTRL;
512 url_scanner_table[' '] |= IS_SPACE;
/* '-' may appear inside domain names */
513 url_scanner_table['-'] |= IS_DOMAIN;
515 /* not defined to be special in rfc0822, but when scanning
516 backwards to find the beginning of the email address we do
517 not want to include this char if we come across it - so
518 this is kind of a hack */
519 url_scanner_table['/'] |= IS_SPECIAL;
/* NOTE(review): the CHARS_* macros are plain char string literals while
 * table_init_bits() takes const unsigned char * -- compilers may warn about
 * pointer signedness here */
521 table_init_bits (IS_LWSP, CHARS_LWSP);
522 table_init_bits (IS_SPECIAL, CHARS_SPECIAL);
523 table_init_bits (IS_URLSAFE, CHARS_URLSAFE);
/* Table generator entry point.  The `#endif` with the BUILD_TABLE comment that
 * follows this function indicates it is only compiled when BUILD_TABLE is
 * defined.  It rebuilds the table and prints it to stdout as a C array
 * definition that can be pasted back into this file. */
526 int main (int argc, char **argv)
530 url_scanner_table_init ();
532 printf ("static unsigned char url_scanner_table[256] = {");
533 for (i = 0; i < 256; i++) {
/* start a new tab-indented row every 16 values; every value except the last
 * is followed by a comma, the last by a newline */
534 printf ("%s%3d%s", (i % 16) ? "" : "\n\t",
535 url_scanner_table[i], i != 255 ? "," : "\n");
542 #endif /* BUILD_TABLE */