camel/camel-search-private.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*
   3  *  Authors: Jeffrey Stedfast <fejj@helixcode.com>
   4  *           Michael Zucchi <NotZed@Ximian.com>
   5  *
   6  *  Copyright 2000 Helix Code, Inc. (www.helixcode.com)
   7  *  Copyright 2001 Ximian Inc. (www.ximian.com)
   8  *
   9  *  This program is free software; you can redistribute it and/or modify
  10  *  it under the terms of the GNU General Public License as published by
  11  *  the Free Software Foundation; either version 2 of the License, or
  12  *  (at your option) any later version.
  13  *
  14  *  This program is distributed in the hope that it will be useful,
  15  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  *  GNU General Public License for more details.
  18  *
  19  *  You should have received a copy of the GNU General Public License
  20  *  along with this program; if not, write to the Free Software
  21  *  Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
  22  *
  23  */
  24
  25 /* (from glibc headers:
  26    POSIX says that <sys/types.h> must be included (by the caller) before <regex.h>.  */
  27
  28 #ifdef HAVE_CONFIG_H
  29 #include <config.h>
  30 #endif
  31
  32 #include <sys/types.h>
  33 #include <regex.h>
  34 #include <string.h>
  35 #include <ctype.h>
  36 #include <stdio.h>
  37
  38 #include "camel-exception.h"
  39 #include "camel-mime-message.h"
  40 #include "camel-multipart.h"
  41 #include "camel-stream-mem.h"
  42 #include "e-util/e-sexp.h"
  43 #include <unicode.h>
  44
  45 #include "camel-search-private.h"
  46
  47 #define d(x)
  48
  49 /* builds the regex into pattern */
  50 /* taken from camel-folder-search, with added isregex & exception parameter */
  51 /* Basically, we build a new regex, either based on subset regex's, or substrings,
  52    that can be executed once over the whoel body, to match anything suitable.
  53    This is more efficient than multiple searches, and probably most (naive) strstr
  54    implementations, over long content.
  55
  56    A small issue is that case-insenstivity wont work entirely correct for utf8 strings. */
  57 int
  58 camel_search_build_match_regex (regex_t *pattern, camel_search_flags_t type, int argc,
  59                                 struct _ESExpResult **argv, CamelException *ex)
  60 {
  61         GString *match = g_string_new("");
  62         int c, i, count=0, err;
  63         char *word;
  64         int flags;
  65
  66         /* build a regex pattern we can use to match the words, we OR them together */
  67         if (argc>1)
  68                 g_string_append_c(match, '(');
  69         for (i=0;i<argc;i++) {
  70                 if (argv[i]->type == ESEXP_RES_STRING) {
  71                         if (count > 0)
  72                                 g_string_append_c(match, '|');
  73
  74                         word = argv[i]->value.string;
  75                         if (type & CAMEL_SEARCH_MATCH_REGEX) {
  76                                 /* no need to escape because this should already be a valid regex */
  77                                 g_string_append(match, word);
  78                         } else {
  79                                 /* escape any special chars (not sure if this list is complete) */
  80                                 if (type & CAMEL_SEARCH_MATCH_START)
  81                                         g_string_append_c(match, '^');
  82                                 while ((c = *word++)) {
  83                                         if (strchr("*\\.()[]^$+", c) != NULL) {
  84                                                 g_string_append_c(match, '\\');
  85                                         }
  86                                         g_string_append_c(match, c);
  87                                 }
  88                                 if (type & CAMEL_SEARCH_MATCH_END)
  89                                         g_string_append_c(match, '^');
  90                         }
  91                         count++;
  92                 } else {
  93                         g_warning("Invalid type passed to body-contains match function");
  94                 }
  95         }
  96         if (argc>1)
  97                 g_string_append_c(match, ')');
  98         flags = REG_EXTENDED|REG_NOSUB;
  99         if (type & CAMEL_SEARCH_MATCH_ICASE)
 100                 flags |= REG_ICASE;
 101         err = regcomp(pattern, match->str, flags);
 102         if (err != 0) {
 103                 /* regerror gets called twice to get the full error string
 104                    length to do proper posix error reporting */
 105                 int len = regerror(err, pattern, 0, 0);
 106                 char *buffer = g_malloc0(len + 1);
 107
 108                 regerror(err, pattern, buffer, len);
 109                 camel_exception_setv(ex, CAMEL_EXCEPTION_SYSTEM,
 110                                      _("Regular expression compilation failed: %s: %s"),
 111                                      match->str, buffer);
 112
 113                 regfree(pattern);
 114         }
 115         d(printf("Built regex: '%s'\n", match->str));
 116         g_string_free(match, TRUE);
 117         return err;
 118 }
 119
 120 static unsigned char soundex_table[256] = {
 121           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 122           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 123           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 124           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 125           0,  0, 49, 50, 51,  0, 49, 50,  0,  0, 50, 50, 52, 53, 53,  0,
 126          49, 50, 54, 50, 51,  0, 49,  0, 50,  0, 50,  0,  0,  0,  0,  0,
 127           0,  0, 49, 50, 51,  0, 49, 50,  0,  0, 50, 50, 52, 53, 53,  0,
 128          49, 50, 54, 50, 51,  0, 49,  0, 50,  0, 50,  0,  0,  0,  0,  0,
 129           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 130           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 131           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 132           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 133           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 134           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 135           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 136           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 137 };
 138
 139 static void
 140 soundexify (const gchar *sound, gchar code[5])
 141 {
 142         guchar *c, last = '\0';
 143         gint n;
 144
 145         for (c = (guchar *) sound; *c && !isalpha (*c); c++);
 146         code[0] = toupper (*c);
 147         memset (code + 1, '0', 3);
 148         for (n = 1; *c && n < 5; c++) {
 149                 guchar ch = soundex_table[*c];
 150
 151                 if (ch && ch != last) {
 152                         code[n++] = ch;
 153                         last = ch;
 154                 }
 155         }
 156         code[4] = '\0';
 157 }
 158
 159 static gboolean
 160 header_soundex(const char *header, const char *match)
 161 {
 162         char mcode[5], hcode[5];
 163         const char *p;
 164         char c;
 165         GString *word;
 166         int truth = FALSE;
 167
 168         soundexify(match, mcode);
 169
 170         /* split the header into words, and soundexify and compare each one */
 171         /* FIXME: Should this convert to utf8, and split based on that, and what not?
 172            soundex only makes sense for us-ascii though ... */
 173
 174         word = g_string_new("");
 175         p = header;
 176         do {
 177                 c = *p++;
 178                 if (c == 0 || isspace(c)) {
 179                         if (word->len > 0) {
 180                                 soundexify(word->str, hcode);
 181                                 if (strcmp(hcode, mcode) == 0)
 182                                         truth = TRUE;
 183                         }
 184                         g_string_truncate(word, 0);
 185                 } else if (isalpha(c))
 186                         g_string_append_c(word, c);
 187         } while (c && !truth);
 188         g_string_free(word, TRUE);
 189
 190         return truth;
 191 }
 192
 193 #if 0
 194 /* Why do it this way when the unicode lib already has a function to do this? */
 195 static unicode_char_t
 196 utf8_get (const char **inp)
 197 {
 198         guint32 c, v = 0, s, shift;
 199         const unsigned char *p = *inp;
 200
 201         if (p == NULL)
 202                 return 0;
 203
 204         s = *p++;
 205         if ((s & 0x80) == 0) {  /* 7 bit char */
 206                 v = s;
 207         } else if (s>0xf7) {    /* invalid char, we can only have upto 4 bits encoded */
 208                 p = NULL;
 209         } else if (s>=0xc0) {   /* valid start char */
 210                 shift = 0;
 211                 do {
 212                         c = *p++;
 213                         if ((c & 0xc0) == 0x80) {
 214                                 v = (v<<6) | (c&0x3f);
 215                                 shift += 5;
 216                         } else {
 217                                 *inp = NULL;
 218                                 return 0;
 219                         }
 220                         s <<= 1;
 221                 } while ((s & 0x80) != 0);
 222                 v |= s << shift;
 223         } else {                /* invalid start char, internal char */
 224                 p = NULL;
 225         }
 226
 227         *inp = p;
 228         return v;
 229 }
 230 #endif
 231
 232 static unicode_char_t
 233 utf8_get (const char **inp)
 234 {
 235         const unsigned char *p = *inp;
 236         unicode_char_t c;
 237
 238         g_return_val_if_fail (p != NULL, 0);
 239
 240         p = unicode_get_utf8 (p, &c);
 241         *inp = p;
 242
 243         return c;
 244 }
 245
 246 static const char *
 247 camel_ustrstrcase (const char *haystack, const char *needle)
 248 {
 249         unicode_char_t *nuni, *puni;
 250         unicode_char_t u;
 251         const char *p;
 252
 253         g_return_val_if_fail (haystack != NULL, NULL);
 254         g_return_val_if_fail (needle != NULL, NULL);
 255
 256         if (strlen(needle) == 0)
 257                 return haystack;
 258         if (strlen(haystack) == 0)
 259                 return NULL;
 260
 261         puni = nuni = alloca (sizeof (unicode_char_t) * strlen (needle));
 262
 263         p = needle;
 264         while ((u = utf8_get (&p)))
 265                 *puni++ = unicode_tolower (u);
 266
 267         /* NULL means there was illegal utf-8 sequence */
 268         if (!p)
 269                 return NULL;
 270
 271         p = haystack;
 272         while ((u = utf8_get (&p))) {
 273                 unicode_char_t c;
 274
 275                 c = unicode_tolower (u);
 276                 /* We have valid stripped char */
 277                 if (c == nuni[0]) {
 278                         const gchar *q = p;
 279                         gint npos = 1;
 280
 281                         while (nuni + npos < puni) {
 282                                 u = utf8_get (&q);
 283                                 if (!q || !u)
 284                                         return NULL;
 285
 286                                 c = unicode_tolower (u);
 287                                 if (c != nuni[npos])
 288                                         break;
 289
 290                                 npos++;
 291                         }
 292
 293                         if (nuni + npos == puni)
 294                                 return p;
 295                 }
 296         }
 297
 298         return NULL;
 299 }
 300
 301 #define CAMEL_SEARCH_COMPARE(x, y, z) G_STMT_START {   \
 302         if ((x) == (z)) {                              \
 303                 if ((y) == (z))                        \
 304                         return 0;                      \
 305                 else                                   \
 306                         return -1;                     \
 307         } else if ((y) == (z))                         \
 308                 return 1;                              \
 309 } G_STMT_END
 310
 311 static int
 312 camel_ustrcasecmp (const char *s1, const char *s2)
 313 {
 314         unicode_char_t u1, u2 = 0;
 315
 316         CAMEL_SEARCH_COMPARE (s1, s2, NULL);
 317
 318         u1 = utf8_get (&s1);
 319         u2 = utf8_get (&s2);
 320         while (u1 && u2) {
 321                 u1 = unicode_tolower (u1);
 322                 u2 = unicode_tolower (u2);
 323                 if (u1 < u2)
 324                         return -1;
 325                 else if (u1 > u2)
 326                         return 1;
 327
 328                 u1 = utf8_get (&s1);
 329                 u2 = utf8_get (&s2);
 330         }
 331
 332         /* end of one of the strings ? */
 333         CAMEL_SEARCH_COMPARE (u1, u2, 0);
 334
 335         /* if we have invalid utf8 sequence ?  */
 336         CAMEL_SEARCH_COMPARE (s1, s2, NULL);
 337
 338         return 0;
 339 }
 340
 341 static int
 342 camel_ustrncasecmp (const char *s1, const char *s2, size_t len)
 343 {
 344         unicode_char_t u1, u2 = 0;
 345
 346         CAMEL_SEARCH_COMPARE (s1, s2, NULL);
 347
 348         u1 = utf8_get (&s1);
 349         u2 = utf8_get (&s2);
 350         while (len > 0 && u1 && u2) {
 351                 u1 = unicode_tolower (u1);
 352                 u2 = unicode_tolower (u2);
 353                 if (u1 < u2)
 354                         return -1;
 355                 else if (u1 > u2)
 356                         return 1;
 357
 358                 len--;
 359                 u1 = utf8_get (&s1);
 360                 u2 = utf8_get (&s2);
 361         }
 362
 363         if (len == 0)
 364                 return 0;
 365
 366         /* end of one of the strings ? */
 367         CAMEL_SEARCH_COMPARE (u1, u2, 0);
 368
 369         /* if we have invalid utf8 sequence ?  */
 370         CAMEL_SEARCH_COMPARE (s1, s2, NULL);
 371
 372         return 0;
 373 }
 374
 375
 376 /* searhces for match inside value, if match is mixed case, hten use case-sensitive,
 377    else insensitive */
 378 gboolean
 379 camel_search_header_match (const char *value, const char *match, camel_search_match_t how)
 380 {
 381         const char *p;
 382         int vlen, mlen;
 383
 384         while (*value && isspace (*value))
 385                 value++;
 386
 387         if (how == CAMEL_SEARCH_MATCH_SOUNDEX)
 388                 return header_soundex (value, match);
 389
 390         vlen = strlen (value);
 391         mlen = strlen (match);
 392         if (vlen < mlen)
 393                 return FALSE;
 394
 395         /* from dan the man, if we have mixed case, perform a case-sensitive match,
 396            otherwise not */
 397         p = match;
 398         while (*p) {
 399                 if (isupper(*p)) {
 400                         switch(how) {
 401                         case CAMEL_SEARCH_MATCH_EXACT:
 402                                 return strcmp(value, match) == 0;
 403                         case CAMEL_SEARCH_MATCH_CONTAINS:
 404                                 return strstr(value, match) != NULL;
 405                         case CAMEL_SEARCH_MATCH_STARTS:
 406                                 return strncmp (value, match, mlen) == 0;
 407                         case CAMEL_SEARCH_MATCH_ENDS:
 408                                 return strcmp (value + vlen - mlen, match) == 0;
 409                         default:
 410                                 break;
 411                         }
 412                         return FALSE;
 413                 }
 414                 p++;
 415         }
 416         switch(how) {
 417         case CAMEL_SEARCH_MATCH_EXACT:
 418                 return camel_ustrcasecmp(value, match) == 0;
 419         case CAMEL_SEARCH_MATCH_CONTAINS:
 420                 return camel_ustrstrcase(value, match) != NULL;
 421         case CAMEL_SEARCH_MATCH_STARTS:
 422                 return camel_ustrncasecmp (value, match, mlen) == 0;
 423         case CAMEL_SEARCH_MATCH_ENDS:
 424                 return camel_ustrcasecmp (value + vlen - mlen, match) == 0;
 425         default:
 426                 break;
 427         }
 428
 429         return FALSE;
 430 }
 431
 432 /* performs a 'slow' content-based match */
 433 /* there is also an identical copy of this in camel-filter-search.c */
 434 gboolean
 435 camel_search_message_body_contains(CamelDataWrapper *object, regex_t *pattern)
 436 {
 437         CamelDataWrapper *containee;
 438         int truth = FALSE;
 439         int parts, i;
 440
 441         containee = camel_medium_get_content_object(CAMEL_MEDIUM(object));
 442
 443         if (containee == NULL)
 444                 return FALSE;
 445
 446         /* TODO: I find it odd that get_part and get_content_object do not
 447            add a reference, probably need fixing for multithreading */
 448
 449         /* using the object types is more accurate than using the mime/types */
 450         if (CAMEL_IS_MULTIPART(containee)) {
 451                 parts = camel_multipart_get_number(CAMEL_MULTIPART(containee));
 452                 for (i=0;i<parts && truth==FALSE;i++) {
 453                         CamelDataWrapper *part = (CamelDataWrapper *)camel_multipart_get_part(CAMEL_MULTIPART(containee), i);
 454                         if (part)
 455                                 truth = camel_search_message_body_contains(part, pattern);
 456                 }
 457         } else if (CAMEL_IS_MIME_MESSAGE(containee)) {
 458                 /* for messages we only look at its contents */
 459                 truth = camel_search_message_body_contains((CamelDataWrapper *)containee, pattern);
 460         } else if (header_content_type_is(CAMEL_DATA_WRAPPER(containee)->mime_type, "text", "*")) {
 461                 /* for all other text parts, we look inside, otherwise we dont care */
 462                 CamelStreamMem *mem = (CamelStreamMem *)camel_stream_mem_new();
 463
 464                 camel_data_wrapper_write_to_stream(containee, (CamelStream *)mem);
 465                 camel_stream_write((CamelStream *)mem, "", 1);
 466                 truth = regexec(pattern, mem->buffer->data, 0, NULL, 0) == 0;
 467                 camel_object_unref((CamelObject *)mem);
 468         }
 469         return truth;
 470 }
 471