1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * Authors: Jeffrey Stedfast <fejj@ximian.com>
4 * Michael Zucchi <NotZed@Ximian.com>
6 * Copyright 2000 Ximian, Inc. (www.ximian.com)
7 * Copyright 2001 Ximian Inc. (www.ximian.com)
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of version 2 of the GNU Lesser General Public
11 * License as published by the Free Software Foundation.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
29 /* POSIX requires <sys/types.h> be included before <regex.h> */
30 #include <sys/types.h>
38 #include <glib/gi18n-lib.h>
40 #include <libedataserver/e-sexp.h>
42 #include "camel-exception.h"
43 #include "camel-mime-message.h"
44 #include "camel-multipart.h"
45 #include "camel-search-private.h"
46 #include "camel-stream-mem.h"
51 camel_utf8_getc(const unsigned char **ptr)
53 register unsigned char *p = (unsigned char *)*ptr;
54 register unsigned char c, r;
55 register guint32 v, m;
63 } else if (r < 0xfe) { /* valid start char? */
65 m = 0x7f80; /* used to mask out the length bits */
68 if ((c & 0xc0) != 0x80) {
72 v = (v<<6) | (c & 0x3f);
87 /* builds the regex into pattern */
88 /* taken from camel-folder-search, with added isregex & exception parameter */
89 /* Basically, we build a new regex, either based on subset regex's, or substrings,
90 that can be executed once over the whoel body, to match anything suitable.
91 This is more efficient than multiple searches, and probably most (naive) strstr
92 implementations, over long content.
94 A small issue is that case-insenstivity wont work entirely correct for utf8 strings. */
96 camel_search_build_match_regex (regex_t *pattern, camel_search_flags_t type, int argc,
97 struct _ESExpResult **argv, CamelException *ex)
99 GString *match = g_string_new("");
100 int c, i, count=0, err;
104 /* build a regex pattern we can use to match the words, we OR them together */
106 g_string_append_c (match, '(');
107 for (i = 0; i < argc; i++) {
108 if (argv[i]->type == ESEXP_RES_STRING) {
110 g_string_append_c (match, '|');
112 word = argv[i]->value.string;
113 if (type & CAMEL_SEARCH_MATCH_REGEX) {
114 /* no need to escape because this should already be a valid regex */
115 g_string_append (match, word);
117 /* escape any special chars (not sure if this list is complete) */
118 if (type & CAMEL_SEARCH_MATCH_START)
119 g_string_append_c (match, '^');
120 while ((c = *word++)) {
121 if (strchr ("*\\.()[]^$+", c) != NULL) {
122 g_string_append_c (match, '\\');
124 g_string_append_c (match, c);
126 if (type & CAMEL_SEARCH_MATCH_END)
127 g_string_append_c (match, '^');
131 g_warning("Invalid type passed to body-contains match function");
135 g_string_append_c (match, ')');
136 flags = REG_EXTENDED|REG_NOSUB;
137 if (type & CAMEL_SEARCH_MATCH_ICASE)
139 if (type & CAMEL_SEARCH_MATCH_NEWLINE)
140 flags |= REG_NEWLINE;
141 err = regcomp (pattern, match->str, flags);
143 /* regerror gets called twice to get the full error string
144 length to do proper posix error reporting */
145 int len = regerror (err, pattern, 0, 0);
146 char *buffer = g_malloc0 (len + 1);
148 regerror (err, pattern, buffer, len);
149 camel_exception_setv (ex, CAMEL_EXCEPTION_SYSTEM,
150 _("Regular expression compilation failed: %s: %s"),
155 d(printf("Built regex: '%s'\n", match->str));
156 g_string_free (match, TRUE);
161 static unsigned char soundex_table[256] = {
162 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
163 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 49, 50, 51, 0, 49, 50, 0, 0, 50, 50, 52, 53, 53, 0,
167 49, 50, 54, 50, 51, 0, 49, 0, 50, 0, 50, 0, 0, 0, 0, 0,
168 0, 0, 49, 50, 51, 0, 49, 50, 0, 0, 50, 50, 52, 53, 53, 0,
169 49, 50, 54, 50, 51, 0, 49, 0, 50, 0, 50, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
173 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
174 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
177 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181 soundexify (const gchar *sound, gchar code[5])
183 guchar *c, last = '\0';
186 for (c = (guchar *) sound; *c && !isalpha (*c); c++);
187 code[0] = toupper (*c);
188 memset (code + 1, '0', 3);
189 for (n = 1; *c && n < 5; c++) {
190 guchar ch = soundex_table[*c];
192 if (ch && ch != last) {
201 header_soundex (const char *header, const char *match)
203 char mcode[5], hcode[5];
209 soundexify (match, mcode);
211 /* split the header into words, and soundexify and compare each one */
212 /* FIXME: Should this convert to utf8, and split based on that, and what not?
213 soundex only makes sense for us-ascii though ... */
215 word = g_string_new("");
219 if (c == 0 || isspace (c)) {
221 soundexify (word->str, hcode);
222 if (strcmp (hcode, mcode) == 0)
225 g_string_truncate (word, 0);
226 } else if (isalpha (c))
227 g_string_append_c (word, c);
228 } while (c && !truth);
229 g_string_free (word, TRUE);
235 camel_ustrstrcase (const char *haystack, const char *needle)
237 gunichar *nuni, *puni;
239 const unsigned char *p;
241 g_return_val_if_fail (haystack != NULL, NULL);
242 g_return_val_if_fail (needle != NULL, NULL);
244 if (strlen (needle) == 0)
246 if (strlen (haystack) == 0)
249 puni = nuni = g_alloca (sizeof (gunichar) * strlen (needle));
251 p = (const unsigned char *) needle;
252 while ((u = camel_utf8_getc(&p)))
253 *puni++ = g_unichar_tolower (u);
255 /* NULL means there was illegal utf-8 sequence */
259 p = (const unsigned char *)haystack;
260 while ((u = camel_utf8_getc(&p))) {
263 c = g_unichar_tolower (u);
264 /* We have valid stripped char */
266 const unsigned char *q = p;
269 while (nuni + npos < puni) {
270 u = camel_utf8_getc(&q);
274 c = g_unichar_tolower (u);
281 if (nuni + npos == puni)
282 return (const char *) p;
289 #define CAMEL_SEARCH_COMPARE(x, y, z) G_STMT_START { \
295 } else if ((y) == (z)) \
300 camel_ustrcasecmp (const char *s1, const char *s2)
304 CAMEL_SEARCH_COMPARE (s1, s2, NULL);
306 u1 = camel_utf8_getc(&s1);
307 u2 = camel_utf8_getc(&s2);
309 u1 = g_unichar_tolower (u1);
310 u2 = g_unichar_tolower (u2);
316 u1 = camel_utf8_getc(&s1);
317 u2 = camel_utf8_getc(&s2);
320 /* end of one of the strings ? */
321 CAMEL_SEARCH_COMPARE (u1, u2, 0);
323 /* if we have invalid utf8 sequence ? */
324 CAMEL_SEARCH_COMPARE (s1, s2, NULL);
330 camel_ustrncasecmp (const char *s1, const char *s2, size_t len)
334 CAMEL_SEARCH_COMPARE (s1, s2, NULL);
336 u1 = camel_utf8_getc(&s1);
337 u2 = camel_utf8_getc(&s2);
338 while (len > 0 && u1 && u2) {
339 u1 = g_unichar_tolower (u1);
340 u2 = g_unichar_tolower (u2);
347 u1 = camel_utf8_getc(&s1);
348 u2 = camel_utf8_getc(&s2);
354 /* end of one of the strings ? */
355 CAMEL_SEARCH_COMPARE (u1, u2, 0);
357 /* if we have invalid utf8 sequence ? */
358 CAMEL_SEARCH_COMPARE (s1, s2, NULL);
363 /* value is the match value suitable for exact match if required */
365 header_match(const char *value, const char *match, camel_search_match_t how)
367 const unsigned char *p;
371 if (how == CAMEL_SEARCH_MATCH_SOUNDEX)
372 return header_soundex (value, match);
374 vlen = strlen(value);
375 mlen = strlen(match);
379 /* from dan the man, if we have mixed case, perform a case-sensitive match,
381 p = (const unsigned char *)match;
382 while ((c = camel_utf8_getc(&p))) {
383 if (g_unichar_isupper(c)) {
385 case CAMEL_SEARCH_MATCH_EXACT:
386 return strcmp(value, match) == 0;
387 case CAMEL_SEARCH_MATCH_CONTAINS:
388 return strstr(value, match) != NULL;
389 case CAMEL_SEARCH_MATCH_STARTS:
390 return strncmp(value, match, mlen) == 0;
391 case CAMEL_SEARCH_MATCH_ENDS:
392 return strcmp(value + vlen - mlen, match) == 0;
401 case CAMEL_SEARCH_MATCH_EXACT:
402 return camel_ustrcasecmp(value, match) == 0;
403 case CAMEL_SEARCH_MATCH_CONTAINS:
404 return camel_ustrstrcase(value, match) != NULL;
405 case CAMEL_SEARCH_MATCH_STARTS:
406 return camel_ustrncasecmp(value, match, mlen) == 0;
407 case CAMEL_SEARCH_MATCH_ENDS:
408 return camel_ustrcasecmp(value + vlen - mlen, match) == 0;
416 /* searhces for match inside value, if match is mixed case, hten use case-sensitive,
419 camel_search_header_match (const char *value, const char *match, camel_search_match_t how, camel_search_t type, const char *default_charset)
421 const char *name, *addr, *ptr;
422 int truth = FALSE, i;
423 CamelInternetAddress *cia;
424 char *v, *vdom, *mdom;
428 while ((c = camel_utf8_getc(&ptr)) && g_unichar_isspace(c))
432 case CAMEL_SEARCH_TYPE_ENCODED:
433 v = camel_header_decode_string(value, default_charset); /* FIXME: Find header charset */
434 truth = header_match(v, match, how);
437 case CAMEL_SEARCH_TYPE_MLIST:
438 /* Special mailing list old-version domain hack
439 If one of the mailing list names doesn't have an @ in it, its old-style, so
440 only match against the pre-domain part, which should be common */
442 vdom = strchr(value, '@');
443 mdom = strchr(match, '@');
444 if (mdom == NULL && vdom != NULL) {
445 v = g_alloca(vdom-value+1);
446 memcpy(v, value, vdom-value);
449 } else if (mdom != NULL && vdom == NULL) {
450 v = g_alloca(mdom-match+1);
451 memcpy(v, match, mdom-match);
456 case CAMEL_SEARCH_TYPE_ASIS:
457 truth = header_match(value, match, how);
459 case CAMEL_SEARCH_TYPE_ADDRESS_ENCODED:
460 case CAMEL_SEARCH_TYPE_ADDRESS:
461 /* possible simple case to save some work if we can */
462 if (header_match(value, match, how))
465 /* Now we decode any addresses, and try asis matches on name and address parts */
466 cia = camel_internet_address_new();
467 if (type == CAMEL_SEARCH_TYPE_ADDRESS_ENCODED)
468 camel_address_decode((CamelAddress *)cia, value);
470 camel_address_unformat((CamelAddress *)cia, value);
472 for (i=0; !truth && camel_internet_address_get(cia, i, &name, &addr);i++)
473 truth = (name && header_match(name, match, how)) || (addr && header_match(addr, match, how));
475 camel_object_unref (cia);
482 /* performs a 'slow' content-based match */
483 /* there is also an identical copy of this in camel-filter-search.c */
485 camel_search_message_body_contains (CamelDataWrapper *object, regex_t *pattern)
487 CamelDataWrapper *containee;
491 containee = camel_medium_get_content_object (CAMEL_MEDIUM (object));
493 if (containee == NULL)
496 /* using the object types is more accurate than using the mime/types */
497 if (CAMEL_IS_MULTIPART (containee)) {
498 parts = camel_multipart_get_number (CAMEL_MULTIPART (containee));
499 for (i = 0; i < parts && truth == FALSE; i++) {
500 CamelDataWrapper *part = (CamelDataWrapper *)camel_multipart_get_part (CAMEL_MULTIPART (containee), i);
502 truth = camel_search_message_body_contains (part, pattern);
504 } else if (CAMEL_IS_MIME_MESSAGE (containee)) {
505 /* for messages we only look at its contents */
506 truth = camel_search_message_body_contains ((CamelDataWrapper *)containee, pattern);
507 } else if (camel_content_type_is(CAMEL_DATA_WRAPPER (containee)->mime_type, "text", "*")) {
508 /* for all other text parts, we look inside, otherwise we dont care */
509 CamelStreamMem *mem = (CamelStreamMem *)camel_stream_mem_new ();
511 camel_data_wrapper_write_to_stream (containee, CAMEL_STREAM (mem));
512 camel_stream_write (CAMEL_STREAM (mem), "", 1);
513 truth = regexec (pattern, (char *) mem->buffer->data, 0, NULL, 0) == 0;
514 camel_object_unref (mem);
521 output_c(GString *w, guint32 c, int *type)
526 if (!g_unichar_isalnum(c))
527 *type = CAMEL_SEARCH_WORD_COMPLEX | (*type & CAMEL_SEARCH_WORD_8BIT);
529 c = g_unichar_tolower(c);
532 *type |= CAMEL_SEARCH_WORD_8BIT;
534 /* FIXME: use camel_utf8_putc */
535 utf8len = g_unichar_to_utf8(c, utf8);
537 g_string_append(w, utf8);
541 output_w(GString *w, GPtrArray *list, int type)
543 struct _camel_search_word *word;
546 word = g_malloc0(sizeof(*word));
547 word->word = g_strdup(w->str);
549 g_ptr_array_add(list, word);
550 g_string_truncate(w, 0);
554 struct _camel_search_words *
555 camel_search_words_split(const unsigned char *in)
557 int type = CAMEL_SEARCH_WORD_SIMPLE, all = 0;
559 struct _camel_search_words *words;
560 GPtrArray *list = g_ptr_array_new();
564 words = g_malloc0(sizeof(*words));
565 w = g_string_new("");
568 c = camel_utf8_getc(&in);
571 || (inquote && c == '"')
572 || (!inquote && g_unichar_isspace(c))) {
573 output_w(w, list, type);
575 type = CAMEL_SEARCH_WORD_SIMPLE;
579 c = camel_utf8_getc(&in);
581 output_c(w, c, &type);
583 output_w(w, list, type);
586 } else if (c == '\"') {
589 output_c(w, c, &type);
594 g_string_free(w, TRUE);
595 words->len = list->len;
596 words->words = (struct _camel_search_word **)list->pdata;
598 g_ptr_array_free(list, FALSE);
603 /* takes an existing 'words' list, and converts it to another consisting of
604 only simple words, with any punctuation etc stripped */
605 struct _camel_search_words *
606 camel_search_words_simple(struct _camel_search_words *wordin)
609 const unsigned char *ptr, *start, *last;
610 int type = CAMEL_SEARCH_WORD_SIMPLE, all = 0;
611 GPtrArray *list = g_ptr_array_new();
612 struct _camel_search_word *word;
613 struct _camel_search_words *words;
616 words = g_malloc0(sizeof(*words));
618 for (i=0;i<wordin->len;i++) {
619 if ((wordin->words[i]->type & CAMEL_SEARCH_WORD_COMPLEX) == 0) {
620 word = g_malloc0(sizeof(*word));
621 word->type = wordin->words[i]->type;
622 word->word = g_strdup(wordin->words[i]->word);
623 g_ptr_array_add(list, word);
625 ptr = (const unsigned char *) wordin->words[i]->word;
628 c = camel_utf8_getc(&ptr);
629 if (c == 0 || !g_unichar_isalnum(c)) {
631 word = g_malloc0(sizeof(*word));
632 word->word = g_strndup((gchar *) start, last-start);
634 g_ptr_array_add(list, word);
636 type = CAMEL_SEARCH_WORD_SIMPLE;
641 type = CAMEL_SEARCH_WORD_8BIT;
647 words->len = list->len;
648 words->words = (struct _camel_search_word **)list->pdata;
650 g_ptr_array_free(list, FALSE);
656 camel_search_words_free(struct _camel_search_words *words)
660 for (i=0;i<words->len;i++) {
661 struct _camel_search_word *word = words->words[i];
666 g_free(words->words);