1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * Copyright (C) 2000-2012 Jeffrey Stedfast
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public License
7 * as published by the Free Software Foundation; either version 2.1
8 * of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free
17 * Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
31 #include "url-scanner.h"
41 url_scanner_new (void)
45 scanner = g_new (UrlScanner, 1);
46 scanner->patterns = g_ptr_array_new ();
47 scanner->trie = g_trie_new (TRUE);
54 url_scanner_free (UrlScanner *scanner)
56 g_return_if_fail (scanner != NULL);
58 g_ptr_array_free (scanner->patterns, TRUE);
59 g_trie_free (scanner->trie);
65 url_scanner_add (UrlScanner *scanner, urlpattern_t *pattern)
67 g_return_if_fail (scanner != NULL);
69 g_trie_add (scanner->trie, pattern->pattern, scanner->patterns->len);
70 g_ptr_array_add (scanner->patterns, pattern);
75 url_scanner_scan (UrlScanner *scanner, const char *in, size_t inlen, urlmatch_t *match)
77 const char *pos, *inend;
81 g_return_val_if_fail (scanner != NULL, FALSE);
82 g_return_val_if_fail (in != NULL, FALSE);
84 if (!(pos = g_trie_search (scanner->trie, in, inlen, &pattern_id)))
87 pat = g_ptr_array_index (scanner->patterns, pattern_id);
89 match->pattern = pat->pattern;
90 match->prefix = pat->prefix;
93 if (!pat->start (in, pos, inend, match))
96 if (!pat->end (in, pos, inend, match))
103 static unsigned char url_scanner_table[256] = {
104 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1,
105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106 24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160,
107 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128,
108 160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
109 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,128,
110 128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
111 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128, 1,
112 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
113 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
114 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
115 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
116 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
117 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
118 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
119 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
128 IS_SPECIAL = (1 << 5),
129 IS_DOMAIN = (1 << 6),
130 IS_URLSAFE = (1 << 7),
/* Character-class predicates: each one looks up x in url_scanner_table and
 * tests flag bits of that entry. x is cast to unsigned char so the index
 * stays inside the 256-entry table even where plain char is signed.
 *
 * is_atom() is the only negative test: it holds when none of IS_SPECIAL,
 * IS_SPACE or IS_CTRL is set. is_urlsafe() holds when any of IS_ALPHA,
 * IS_DIGIT or IS_URLSAFE is set. The remaining macros test a single bit. */
133 #define is_ctrl(x) ((url_scanner_table[(unsigned char)(x)] & IS_CTRL) != 0)
134 #define is_lwsp(x) ((url_scanner_table[(unsigned char)(x)] & IS_LWSP) != 0)
135 #define is_atom(x) ((url_scanner_table[(unsigned char)(x)] & (IS_SPECIAL|IS_SPACE|IS_CTRL)) == 0)
136 #define is_alpha(x) ((url_scanner_table[(unsigned char)(x)] & IS_ALPHA) != 0)
137 #define is_digit(x) ((url_scanner_table[(unsigned char)(x)] & IS_DIGIT) != 0)
138 #define is_domain(x) ((url_scanner_table[(unsigned char)(x)] & IS_DOMAIN) != 0)
139 #define is_urlsafe(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
153 is_open_brace (char c)
157 for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
158 if (c == url_braces[i].open)
166 url_stop_at_brace (const char *in, size_t so)
171 for (i = 0; i < 4; i++) {
172 if (in[so - 1] == url_braces[i].open)
173 return url_braces[i].close;
182 url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
184 register const char *inptr = pos;
186 g_assert (*inptr == '@');
194 if (is_atom (*inptr))
199 while (inptr > in && is_atom (*inptr))
202 if (inptr > in && *inptr == '.')
206 if (!is_atom (*inptr) || is_open_brace (*inptr))
212 match->um_so = (inptr - in);
218 url_addrspec_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
220 const char *inptr = pos;
221 int parts = 0, digits;
222 gboolean got_dot = FALSE;
224 g_assert (*inptr == '@');
234 while (inptr < inend && is_digit (*inptr) && digits < 3) {
241 if (*inptr != '.' && parts != 4)
245 if (inptr < inend && *inptr == ']')
252 while (inptr < inend) {
253 if (is_domain (*inptr))
258 while (inptr < inend && is_domain (*inptr))
261 if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) {
269 if (inptr == pos + 1 || !got_dot)
272 match->um_eo = (inptr - in);
279 url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
281 match->um_so = (pos - in);
287 url_file_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
289 register const char *inptr = pos;
292 inptr += strlen (match->pattern);
297 close_brace = url_stop_at_brace (in, match->um_so);
299 while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace)
305 match->um_eo = (inptr - in);
311 url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
313 match->um_so = (pos - in);
319 url_web_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
321 register const char *inptr = pos;
322 gboolean openbracket = FALSE;
323 gboolean passwd = FALSE;
329 inptr += strlen (match->pattern);
331 close_brace = url_stop_at_brace (in, match->um_so);
333 /* find the end of the domain */
334 if (is_digit (*inptr)) {
336 } else if (is_atom (*inptr)) {
337 /* might be a domain or user@domain */
339 while (inptr < inend) {
340 if (!is_atom (*inptr))
345 while (inptr < inend && is_atom (*inptr))
348 if ((inptr + 1) < inend && *inptr == '.' && is_atom (inptr[1]))
358 /* IPv6 (or possibly IPv4) address literal */
362 if (is_domain (*inptr)) {
363 /* domain name or IPv4 address */
366 } else if (*inptr == '[') {
371 if (is_digit (*inptr)) {
373 /* could be IPv4 or IPv6 */
374 if ((val = strtol (inptr, &end, 10)) < 0)
376 } else if ((*inptr >= 'A' && *inptr <= 'F') || (*inptr >= 'a' && *inptr <= 'f')) {
377 /* IPv6 address literals are in hex */
378 if ((val = strtol (inptr, &end, 16)) < 0 || *end != ':')
380 } else if (*inptr == ':') {
381 /* IPv6 can start with a ':' */
382 end = (char *) inptr;
383 val = 256; /* invalid value */
389 case '.': /* IPv4 address literal */
392 if (val > 255 || *end != '.')
396 if ((val = strtol (inptr, &end, 10)) < 0)
402 if (val > 255 || n < 4 || (openbracket && *end != ']'))
407 case ':': /* IPv6 address literal */
414 if ((val = strtol (inptr, &end, 16)) < 0)
420 } while (end > inptr && *end == ':');
430 } else if (is_domain (*inptr)) {
432 while (inptr < inend) {
433 if (!is_domain (*inptr))
438 while (inptr < inend && is_domain (*inptr))
441 if ((inptr + 1) < inend && *inptr == '.' && (is_domain (inptr[1]) || inptr[1] == '/'))
450 case ':': /* we either have a port or a password */
453 if (is_digit (*inptr) || passwd) {
454 port = (*inptr++ - '0');
456 while (inptr < inend && is_digit (*inptr) && port < 65536)
457 port = (port * 10) + (*inptr++ - '0');
459 if (!passwd && (port >= 65536 || *inptr == '@')) {
461 /* this must be a password? */
472 while (inptr < inend && is_atom (*inptr))
475 if ((inptr + 2) < inend) {
478 if (is_domain (*inptr))
486 if (inptr >= inend || *inptr != '/')
489 /* we have a '/' so there could be a path - fall through */
490 case '/': /* we've detected a path component to our url */
493 while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace)
502 /* urls are extremely unlikely to end with any
503 * punctuation, so strip any trailing
504 * punctuation off. Also strip off any closing
505 * braces or quotes. */
506 while (inptr > pos && strchr (",.:;?!-|)}]'\"", inptr[-1]))
509 match->um_eo = (inptr - in);
519 /* got these from rfc822 */
520 #define CHARS_LWSP " \t\n\r" /* linear whitespace chars */
521 #define CHARS_SPECIAL "()<>@,;:\\\".[]"
523 /* got these from rfc1738 */
524 #define CHARS_URLSAFE "$-_.+!*'(),{}|\\^~[]`#%\";/?:@&="
528 table_init_bits (unsigned int mask, const unsigned char *vals)
532 for (i = 0; vals[i] != '\0'; i++)
533 url_scanner_table[vals[i]] |= mask;
537 url_scanner_table_init (void)
541 for (i = 0; i < 256; i++) {
542 url_scanner_table[i] = 0;
544 url_scanner_table[i] |= IS_CTRL;
545 if ((i >= '0' && i <= '9'))
546 url_scanner_table[i] |= IS_DIGIT | IS_DOMAIN;
547 if ((i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z'))
548 url_scanner_table[i] |= IS_ALPHA | IS_DOMAIN;
550 url_scanner_table[i] |= IS_CTRL;
553 url_scanner_table[' '] |= IS_SPACE;
554 url_scanner_table['-'] |= IS_DOMAIN;
556 /* not defined to be special in rfc0822, but when scanning
557 backwards to find the beginning of the email address we do
558 not want to include this char if we come across it - so
559 this is kind of a hack, but it's ok */
560 url_scanner_table['/'] |= IS_SPECIAL;
562 table_init_bits (IS_LWSP, CHARS_LWSP);
563 table_init_bits (IS_SPECIAL, CHARS_SPECIAL);
564 table_init_bits (IS_URLSAFE, CHARS_URLSAFE);
567 int main (int argc, char **argv)
571 url_scanner_table_init ();
573 printf ("static unsigned char url_scanner_table[256] = {");
574 for (i = 0; i < 256; i++) {
575 printf ("%s%3d%s", (i % 16) ? "" : "\n\t",
576 url_scanner_table[i], i != 255 ? "," : "\n");
583 #endif /* BUILD_TABLE */