camel/camel-url-scanner.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*
   3  *  Authors: Jeffrey Stedfast <fejj@ximian.com>
   4  *
   5  *  Copyright 2002 Ximian, Inc. (www.ximian.com)
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU Lesser General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *  GNU Lesser General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU Lesser General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  20  *
  21  */
  22
  23
  24 #ifdef HAVE_CONFIG_H
  25 #include <config.h>
  26 #endif
  27
  28 #include <ctype.h>
  29 #include <stdio.h>
  30 #include <string.h>
  31
  32 #include <libedataserver/e-trie.h>
  33
  34 #include "camel-url-scanner.h"
  35 #include "camel-utf8.h"
  36
   /* Scanner state: the registered patterns plus the trie used to locate them.
    * The index a pattern occupies in `patterns` is the id it is registered
    * under in `trie` (see camel_url_scanner_add). */
   struct _CamelUrlScanner {
           GPtrArray *patterns;   /* urlpattern_t pointers, in registration order */
           ETrie *trie;           /* pattern string -> index into `patterns` */
   };
  41
  42
  43 CamelUrlScanner *
  44 camel_url_scanner_new (void)
  45 {
  46         CamelUrlScanner *scanner;
  47
  48         scanner = g_new (CamelUrlScanner, 1);
  49         scanner->patterns = g_ptr_array_new ();
  50         scanner->trie = e_trie_new (TRUE);
  51
  52         return scanner;
  53 }
  54
  55
  56 void
  57 camel_url_scanner_free (CamelUrlScanner *scanner)
  58 {
  59         g_return_if_fail (scanner != NULL);
  60
  61         g_ptr_array_free (scanner->patterns, TRUE);
  62         e_trie_free (scanner->trie);
  63         g_free (scanner);
  64 }
  65
  66
  67 void
  68 camel_url_scanner_add (CamelUrlScanner *scanner, urlpattern_t *pattern)
  69 {
  70         g_return_if_fail (scanner != NULL);
  71
  72         e_trie_add (scanner->trie, pattern->pattern, scanner->patterns->len);
  73         g_ptr_array_add (scanner->patterns, pattern);
  74 }
  75
  76
  77 gboolean
  78 camel_url_scanner_scan (CamelUrlScanner *scanner, const char *in, size_t inlen, urlmatch_t *match)
  79 {
  80         const char *pos, *inptr, *inend;
  81         urlpattern_t *pat;
  82         int pattern;
  83
  84         g_return_val_if_fail (scanner != NULL, FALSE);
  85         g_return_val_if_fail (in != NULL, FALSE);
  86
  87         inptr = in;
  88         inend = in + inlen;
  89
  90         do {
  91                 if (!(pos = e_trie_search (scanner->trie, inptr, inlen, &pattern)))
  92                         return FALSE;
  93
  94                 pat = g_ptr_array_index (scanner->patterns, pattern);
  95
  96                 match->pattern = pat->pattern;
  97                 match->prefix = pat->prefix;
  98
  99                 if (pat->start (in, pos, inend, match) && pat->end (in, pos, inend, match))
 100                         return TRUE;
 101
 102                 inptr = pos;
 103                 if (camel_utf8_getc_limit (&inptr, (const unsigned char *) inend) == 0xffff)
 104                         break;
 105
 106                 inlen = inend - inptr;
 107         } while (inptr < inend);
 108
 109         return FALSE;
 110 }
 111
 112
  /* Character classification table, indexed by the character's value as an
   * unsigned char.  Each entry is a bitmask of the IS_* flags declared
   * below; rows hold 16 entries each.  The values are the output of the
   * generator program in the BUILD_TABLE section of this file, which is
   * also why the array is not const (the generator fills it in). */
  static unsigned char url_scanner_table[256] = {
            1,  1,  1,  1,  1,  1,  1,  1,  1,  9,  9,  1,  1,  9,  1,  1,
            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
           24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160,
           68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128,
          160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
           66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,128,
          128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
           66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128,  1,
            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
  };
 131
 132 enum {
 133         IS_CTRL         = (1 << 0),
 134         IS_ALPHA        = (1 << 1),
 135         IS_DIGIT        = (1 << 2),
 136         IS_LWSP         = (1 << 3),
 137         IS_SPACE        = (1 << 4),
 138         IS_SPECIAL      = (1 << 5),
 139         IS_DOMAIN       = (1 << 6),
 140         IS_URLSAFE      = (1 << 7),
 141 };
 142
 143 #define is_ctrl(x) ((url_scanner_table[(unsigned char)(x)] & IS_CTRL) != 0)
 144 #define is_lwsp(x) ((url_scanner_table[(unsigned char)(x)] & IS_LWSP) != 0)
 145 #define is_atom(x) ((url_scanner_table[(unsigned char)(x)] & (IS_SPECIAL|IS_SPACE|IS_CTRL)) == 0)
 146 #define is_alpha(x) ((url_scanner_table[(unsigned char)(x)] & IS_ALPHA) != 0)
 147 #define is_digit(x) ((url_scanner_table[(unsigned char)(x)] & IS_DIGIT) != 0)
 148 #define is_domain(x) ((url_scanner_table[(unsigned char)(x)] & IS_DOMAIN) != 0)
 149 #define is_urlsafe(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
 150
 151
  /* Pairs of delimiters that may enclose a url.  When a url is immediately
   * preceded by `open`, the matching `close` is used by the end-of-url
   * callbacks to decide where the url stops (see url_stop_at_brace). */
  static const struct {
          const char open;    /* delimiter found just before the url */
          const char close;   /* delimiter that pairs with `open` */
  } url_braces[] = {
          { '(', ')' },
          { '{', '}' },
          { '[', ']' },
          { '<', '>' },
          { '|', '|' },
          { '\'', '\'' },
  };
 163
 164 static gboolean
 165 is_open_brace (char c)
 166 {
 167         int i;
 168
 169         for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
 170                 if (c == url_braces[i].open)
 171                         return TRUE;
 172         }
 173
 174         return FALSE;
 175 }
 176
 177 static char
 178 url_stop_at_brace (const char *in, size_t so, char *open_brace)
 179 {
 180         int i;
 181
 182         if (open_brace != NULL)
 183                 *open_brace = '\0';
 184
 185         if (so > 0) {
 186                 for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
 187                         if (in[so - 1] == url_braces[i].open) {
 188                                 if (open_brace != NULL)
 189                                         *open_brace = url_braces[i].open;
 190                                 return url_braces[i].close;
 191                         }
 192                 }
 193         }
 194
 195         return '\0';
 196 }
 197
 198
 199 gboolean
 200 camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 201 {
 202         register const char *inptr = pos;
 203
 204         g_assert (*inptr == '@');
 205
 206         if (inptr > in)
 207                 inptr--;
 208
 209         while (inptr > in) {
 210                 if (is_atom (*inptr))
 211                         inptr--;
 212                 else
 213                         break;
 214
 215                 while (inptr > in && is_atom (*inptr))
 216                         inptr--;
 217
 218                 if (inptr > in && *inptr == '.')
 219                         inptr--;
 220         }
 221
 222         while (!is_atom (*inptr) || is_open_brace (*inptr))
 223                 inptr++;
 224
 225         if (inptr >= pos)
 226                 return FALSE;
 227
 228         match->um_so = (inptr - in);
 229
 230         return TRUE;
 231 }
 232
 233 gboolean
 234 camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 235 {
 236         const char *inptr = pos;
 237         int parts = 0, digits;
 238         gboolean got_dot = FALSE;
 239
 240         g_assert (*inptr == '@');
 241
 242         inptr++;
 243
 244         if (*inptr == '[') {
 245                 /* domain literal */
 246                 do {
 247                         inptr++;
 248
 249                         digits = 0;
 250                         while (inptr < inend && is_digit (*inptr) && digits < 3) {
 251                                 inptr++;
 252                                 digits++;
 253                         }
 254
 255                         parts++;
 256
 257                         if (*inptr != '.' && parts != 4)
 258                                 return FALSE;
 259                 } while (parts < 4);
 260
 261                 if (*inptr == ']')
 262                         inptr++;
 263                 else
 264                         return FALSE;
 265
 266                 got_dot = TRUE;
 267         } else {
 268                 while (inptr < inend) {
 269                         if (is_domain (*inptr))
 270                                 inptr++;
 271                         else
 272                                 break;
 273
 274                         while (inptr < inend && is_domain (*inptr))
 275                                 inptr++;
 276
 277                         if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) {
 278                                 if (*inptr == '.')
 279                                         got_dot = TRUE;
 280                                 inptr++;
 281                         }
 282                 }
 283         }
 284
 285         /* don't allow toplevel domains */
 286         if (inptr == pos + 1 || !got_dot)
 287                 return FALSE;
 288
 289         match->um_eo = (inptr - in);
 290
 291         return TRUE;
 292 }
 293
 294 gboolean
 295 camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 296 {
 297         match->um_so = (pos - in);
 298
 299         return TRUE;
 300 }
 301
 302 gboolean
 303 camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 304 {
 305         register const char *inptr = pos;
 306         char close_brace;
 307
 308         inptr += strlen (match->pattern);
 309
 310         if (*inptr == '/')
 311                 inptr++;
 312
 313         close_brace = url_stop_at_brace (in, match->um_so, NULL);
 314
 315         while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace)
 316                 inptr++;
 317
 318         if (inptr == pos)
 319                 return FALSE;
 320
 321         match->um_eo = (inptr - in);
 322
 323         return TRUE;
 324 }
 325
 326 gboolean
 327 camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 328 {
 329         if (pos > in && !strncmp (pos, "www", 3)) {
 330                 /* make sure we aren't actually part of another word */
 331                 if (!is_open_brace (pos[-1]) && !isspace (pos[-1]))
 332                         return FALSE;
 333         }
 334
 335         match->um_so = (pos - in);
 336
 337         return TRUE;
 338 }
 339
 340 gboolean
 341 camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 342 {
 343         register const char *inptr = pos;
 344         gboolean passwd = FALSE;
 345         const char *save;
 346         char close_brace, open_brace;
 347         int brace_stack = 0;
 348         int port;
 349
 350         inptr += strlen (match->pattern);
 351
 352         close_brace = url_stop_at_brace (in, match->um_so, &open_brace);
 353
 354         /* find the end of the domain */
 355         if (is_atom (*inptr)) {
 356                 /* might be a domain or user@domain */
 357                 save = inptr;
 358                 while (inptr < inend) {
 359                         if (!is_atom (*inptr))
 360                                 break;
 361
 362                         inptr++;
 363
 364                         while (inptr < inend && is_atom (*inptr))
 365                                 inptr++;
 366
 367                         if ((inptr + 1) < inend && *inptr == '.' && (is_atom (inptr[1]) || inptr[1] == '/'))
 368                                         inptr++;
 369                 }
 370
 371                 if (*inptr != '@')
 372                         inptr = save;
 373                 else
 374                         inptr++;
 375
 376                 goto domain;
 377         } else if (is_domain (*inptr)) {
 378         domain:
 379                 while (inptr < inend) {
 380                         if (!is_domain (*inptr))
 381                                 break;
 382
 383                         inptr++;
 384
 385                         while (inptr < inend && is_domain (*inptr))
 386                                 inptr++;
 387
 388                         if ((inptr + 1) < inend && *inptr == '.' && (is_domain (inptr[1]) || inptr[1] == '/'))
 389                                         inptr++;
 390                 }
 391         } else {
 392                 return FALSE;
 393         }
 394
 395         if (inptr < inend) {
 396                 switch (*inptr) {
 397                 case ':': /* we either have a port or a password */
 398                         inptr++;
 399
 400                         if (is_digit (*inptr) || passwd) {
 401                                 port = (*inptr++ - '0');
 402
 403                                 while (inptr < inend && is_digit (*inptr) && port < 65536)
 404                                         port = (port * 10) + (*inptr++ - '0');
 405
 406                                 if (!passwd && (port >= 65536 || *inptr == '@')) {
 407                                         if (inptr < inend) {
 408                                                 /* this must be a password? */
 409                                                 goto passwd;
 410                                         }
 411
 412                                         inptr--;
 413                                 }
 414                         } else {
 415                         passwd:
 416                                 passwd = TRUE;
 417                                 save = inptr;
 418
 419                                 while (inptr < inend && is_atom (*inptr))
 420                                         inptr++;
 421
 422                                 if ((inptr + 2) < inend) {
 423                                         if (*inptr == '@') {
 424                                                 inptr++;
 425                                                 if (is_domain (*inptr))
 426                                                         goto domain;
 427                                         }
 428
 429                                         return FALSE;
 430                                 }
 431                         }
 432
 433                         if (inptr >= inend || *inptr != '/')
 434                                 break;
 435
 436                         /* we have a '/' so there could be a path - fall through */
 437                 case '/': /* we've detected a path component to our url */
 438                         inptr++;
 439                 case '?':
 440                         while (inptr < inend && is_urlsafe (*inptr)) {
 441                                 if (*inptr == open_brace) {
 442                                         g_message ("++");
 443                                         brace_stack++;
 444                                 } else if (*inptr == close_brace) {
 445                                         g_message ("--");
 446                                         brace_stack--;
 447                                         if (brace_stack == -1)
 448                                                 break;
 449                                 }
 450                                 inptr++;
 451                         }
 452
 453                         break;
 454                 default:
 455                         break;
 456                 }
 457         }
 458
 459         /* urls are extremely unlikely to end with any
 460          * punctuation, so strip any trailing
 461          * punctuation off. Also strip off any closing
 462          * double-quotes. */
 463         while (inptr > pos && strchr (",.:;?!-|}]\"", inptr[-1]))
 464                 inptr--;
 465
 466
 467         match->um_eo = (inptr - in);
 468
 469         return TRUE;
 470 }
 471
 472
 473
 474 #ifdef BUILD_TABLE
 475
 476 #include <stdio.h>
 477
  /* whitespace and "specials" used for address scanning; based on rfc0822
   * (the special list matches that rfc's specials, not anything in rfc1738) */
  #define CHARS_LWSP " \t\n\r"               /* linear whitespace chars */
  #define CHARS_SPECIAL "()<>@,;:\\\".[]"

  /* got these from rfc1738: its safe, extra, national, punctuation and
   * reserved characters, minus '<' and '>' */
  #define CHARS_URLSAFE "$-_.+!*'(),{}|\\^~[]`#%\";/?:@&="
 484
 485
 486 static void
 487 table_init_bits (unsigned int mask, const unsigned char *vals)
 488 {
 489         int i;
 490
 491         for (i = 0; vals[i] != '\0'; i++)
 492                 url_scanner_table[vals[i]] |= mask;
 493 }
 494
 495 static void
 496 url_scanner_table_init (void)
 497 {
 498         int i;
 499
 500         for (i = 0; i < 256; i++) {
 501                 url_scanner_table[i] = 0;
 502                 if (i < 32)
 503                         url_scanner_table[i] |= IS_CTRL;
 504                 if ((i >= '0' && i <= '9'))
 505                         url_scanner_table[i] |= IS_DIGIT | IS_DOMAIN;
 506                 if ((i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z'))
 507                         url_scanner_table[i] |= IS_ALPHA | IS_DOMAIN;
 508                 if (i >= 127)
 509                         url_scanner_table[i] |= IS_CTRL;
 510         }
 511
 512         url_scanner_table[' '] |= IS_SPACE;
 513         url_scanner_table['-'] |= IS_DOMAIN;
 514
 515         /* not defined to be special in rfc0822, but when scanning
 516            backwards to find the beginning of the email address we do
 517            not want to include this char if we come accross it - so
 518            this is kind of a hack */
 519         url_scanner_table['/'] |= IS_SPECIAL;
 520
 521         table_init_bits (IS_LWSP, CHARS_LWSP);
 522         table_init_bits (IS_SPECIAL, CHARS_SPECIAL);
 523         table_init_bits (IS_URLSAFE, CHARS_URLSAFE);
 524 }
 525
 526 int main (int argc, char **argv)
 527 {
 528         int i;
 529
 530         url_scanner_table_init ();
 531
 532         printf ("static unsigned char url_scanner_table[256] = {");
 533         for (i = 0; i < 256; i++) {
 534                 printf ("%s%3d%s", (i % 16) ? "" : "\n\t",
 535                         url_scanner_table[i], i != 255 ? "," : "\n");
 536         }
 537         printf ("};\n\n");
 538
 539         return 0;
 540 }
 541
 542 #endif /* BUILD_TABLE */