1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * Authors: Jeffrey Stedfast <fejj@helixcode.com>
4 * Michael Zucchi <NotZed@Ximian.com>
6 * Copyright 2000 Helix Code, Inc. (www.helixcode.com)
7 * Copyright 2001 Ximian Inc. (www.ximian.com)
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
25 /* (from glibc headers:
26 POSIX says that <sys/types.h> must be included (by the caller) before <regex.h>. */
32 #include <sys/types.h>
38 #include "camel-exception.h"
39 #include "camel-mime-message.h"
40 #include "camel-multipart.h"
41 #include "camel-stream-mem.h"
42 #include "e-util/e-sexp.h"
45 #include "camel-search-private.h"
49 /* builds the regex into pattern */
50 /* taken from camel-folder-search, with added isregex & exception parameter */
51 /* Basically, we build a new regex, either based on subset regexes, or substrings,
52 that can be executed once over the whole body, to match anything suitable.
53 This is more efficient than multiple searches, and probably most (naive) strstr
54 implementations, over long content.
56 A small issue is that case-insensitivity will not work entirely correctly for utf8 strings. */
/*
 * camel_search_build_match_regex:
 * @pattern: regex_t that receives the compiled expression
 * @type: CAMEL_SEARCH_MATCH_* flags (REGEX, START, END, ICASE are tested here)
 * @argc: number of entries in @argv
 * @argv: s-expression results; only ESEXP_RES_STRING entries contribute
 * @ex: exception that receives the regerror() text when compilation fails
 *
 * Builds one POSIX extended regex (REG_EXTENDED|REG_NOSUB) out of the string
 * arguments and compiles it into @pattern.  With CAMEL_SEARCH_MATCH_REGEX a
 * word is appended verbatim; otherwise its metacharacters are backslash
 * escaped and the word is anchored with '^' (MATCH_START) and '$'
 * (MATCH_END).  Non-string arguments only produce a g_warning().
 */
58 camel_search_build_match_regex (regex_t *pattern, camel_search_flags_t type, int argc,
59 struct _ESExpResult **argv, CamelException *ex)
61 GString *match = g_string_new("");
62 int c, i, count=0, err;
66 /* build a regex pattern we can use to match the words, we OR them together */
68 g_string_append_c(match, '(');
69 for (i=0;i<argc;i++) {
70 if (argv[i]->type == ESEXP_RES_STRING) {
72 g_string_append_c(match, '|');
74 word = argv[i]->value.string;
75 if (type & CAMEL_SEARCH_MATCH_REGEX) {
76 /* no need to escape because this should already be a valid regex */
77 g_string_append(match, word);
79 /* escape every character that is special in a POSIX extended regex, so the word matches literally */
80 if (type & CAMEL_SEARCH_MATCH_START)
81 g_string_append_c(match, '^');
82 while ((c = *word++)) {
/* '?', '{' and '|' are ERE operators too and must be escaped along with the rest */
83 if (strchr("*\\.()[]^$+?{|", c) != NULL) {
84 g_string_append_c(match, '\\');
86 g_string_append_c(match, c);
/* an end-of-text match is anchored with '$'; a trailing '^' could never match */
88 if (type & CAMEL_SEARCH_MATCH_END)
89 g_string_append_c(match, '$');
93 g_warning("Invalid type passed to body-contains match function");
97 g_string_append_c(match, ')');
98 flags = REG_EXTENDED|REG_NOSUB;
99 if (type & CAMEL_SEARCH_MATCH_ICASE)
101 err = regcomp(pattern, match->str, flags);
103 /* regerror gets called twice to get the full error string
104 length to do proper posix error reporting */
105 int len = regerror(err, pattern, 0, 0);
106 char *buffer = g_malloc0(len + 1);
108 regerror(err, pattern, buffer, len);
109 camel_exception_setv(ex, CAMEL_EXCEPTION_SYSTEM,
110 _("Regular expression compilation failed: %s: %s"),
115 d(printf("Built regex: '%s'\n", match->str));
/* the pattern text is only needed for compilation and the debug print above */
116 g_string_free(match, TRUE);
/* Soundex lookup table indexed by byte value.  ASCII letters map to their
   Soundex digit stored as an ASCII character (49..54 are '1'..'6'); the
   vowels, H, W, Y and every non-letter byte map to 0.  Upper-case and
   lower-case letters carry the same codes. */
120 static unsigned char soundex_table[256] = {
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x40-0x5f: '@' followed by the upper-case letters */
125 0, 0, 49, 50, 51, 0, 49, 50, 0, 0, 50, 50, 52, 53, 53, 0,
126 49, 50, 54, 50, 51, 0, 49, 0, 50, 0, 50, 0, 0, 0, 0, 0,
/* 0x60-0x7f: '`' followed by the lower-case letters, same codes as above */
127 0, 0, 49, 50, 51, 0, 49, 50, 0, 0, 50, 50, 52, 53, 53, 0,
128 49, 50, 54, 50, 51, 0, 49, 0, 50, 0, 50, 0, 0, 0, 0, 0,
/* 0x80-0xff: bytes outside ASCII have no code */
129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
131 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
132 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
133 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
134 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
135 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* soundexify: compute the Soundex code of sound into the caller-supplied
   code buffer.  Leading non-alphabetic bytes are skipped, the first letter
   is stored upper-cased in code[0], and code[1..3] start out as '0' padding
   before the digits found through soundex_table are considered. */
140 soundexify (const gchar *sound, gchar code[5])
142 guchar *c, last = '\0';
/* advance to the first alphabetic byte, or stop on the terminating NUL */
145 for (c = (guchar *) sound; *c && !isalpha (*c); c++);
146 code[0] = toupper (*c);
/* pre-fill the three digit positions with '0' */
147 memset (code + 1, '0', 3);
/* NOTE(review): the bound is n < 5 although code has only five slots
   including the terminator; confirm how code[4] is finalised */
148 for (n = 1; *c && n < 5; c++) {
149 guchar ch = soundex_table[*c];
/* bytes without a code, and repeats of the value held in last, are ignored */
151 if (ch && ch != last) {
/* header_soundex: compare the Soundex code of match against the Soundex
   code of each word found in header. */
160 header_soundex(const char *header, const char *match)
162 char mcode[5], hcode[5];
/* the code of the search term is computed once, before scanning the header */
168 soundexify(match, mcode);
170 /* split the header into words, and soundexify and compare each one */
171 /* FIXME: Should this convert to utf8, and split based on that, and what not?
172 soundex only makes sense for us-ascii though ... */
/* word accumulates the alphabetic bytes of the word currently being read */
174 word = g_string_new("");
/* a NUL or a whitespace byte terminates the current word */
178 if (c == 0 || isspace(c)) {
180 soundexify(word->str, hcode);
181 if (strcmp(hcode, mcode) == 0)
/* start over with an empty accumulator for the next word */
184 g_string_truncate(word, 0);
185 } else if (isalpha(c))
186 g_string_append_c(word, c);
/* stop at the end of the header, or as soon as truth has been set */
187 } while (c && !truth);
188 g_string_free(word, TRUE);
194 /* Why do it this way when the unicode lib already has a function to do this? */
/* utf8_get: hand-rolled UTF-8 decoder reading from *inp.  The lead byte s
   is classified by its high bits, and the low six bits of every
   continuation byte (10xxxxxx) are shifted into the accumulated value v.
   NOTE(review): how *inp is advanced and what is returned for invalid input
   is not visible in this excerpt; confirm before relying on it. */
195 static unicode_char_t
196 utf8_get (const char **inp)
198 guint32 c, v = 0, s, shift;
199 const unsigned char *p = *inp;
205 if ((s & 0x80) == 0) { /* 7 bit char */
207 } else if (s>0xf7) { /* invalid char, we can only have up to 4 bits encoded */
209 } else if (s>=0xc0) { /* valid start char */
/* a continuation byte must have the form 10xxxxxx */
213 if ((c & 0xc0) == 0x80) {
214 v = (v<<6) | (c&0x3f);
/* loop while the top bit of s is still set */
221 } while ((s & 0x80) != 0);
223 } else { /* invalid start char, internal char */
/* utf8_get: variant that delegates decoding to unicode_get_utf8().
   NOTE(review): presumably selected instead of the hand-rolled decoder by a
   preprocessor conditional that is not visible here; confirm. */
232 static unicode_char_t
233 utf8_get (const char **inp)
235 const unsigned char *p = *inp;
/* a NULL input pointer yields 0 */
238 g_return_val_if_fail (p != NULL, 0);
/* unicode_get_utf8 stores the decoded character in c and its result is kept in p */
240 p = unicode_get_utf8 (p, &c);
/* camel_ustrstrcase: case-insensitive search for the UTF-8 string needle
   inside the UTF-8 string haystack.  The needle is decoded and lower-cased
   once into a temporary array; the haystack is then decoded character by
   character and compared against that array.  Returns NULL when either
   argument is NULL. */
247 camel_ustrstrcase (const char *haystack, const char *needle)
249 unicode_char_t *nuni, *puni;
253 g_return_val_if_fail (haystack != NULL, NULL);
254 g_return_val_if_fail (needle != NULL, NULL);
/* empty needle and empty haystack are handled as special cases */
256 if (strlen(needle) == 0)
258 if (strlen(haystack) == 0)
/* the byte length of needle is an upper bound on its number of characters,
   so one array slot per byte is always enough.
   NOTE(review): alloca() grows with the needle length and is unbounded */
261 puni = nuni = alloca (sizeof (unicode_char_t) * strlen (needle));
/* decode and lower-case the needle; puni ends up one past the last stored character */
264 while ((u = utf8_get (&p)))
265 *puni++ = unicode_tolower (u);
267 /* NULL means there was an illegal utf-8 sequence */
/* walk the haystack one decoded character at a time */
272 while ((u = utf8_get (&p))) {
275 c = unicode_tolower (u);
276 /* We have valid stripped char */
/* npos indexes into the lower-cased needle while characters keep matching */
281 while (nuni + npos < puni) {
286 c = unicode_tolower (u);
/* reaching puni means every needle character was matched */
293 if (nuni + npos == puni)
/* CAMEL_SEARCH_COMPARE(x, y, z): multi-statement helper (wrapped in
   G_STMT_START) used by camel_ustrcasecmp() and camel_ustrncasecmp() with z
   as a sentinel value (NULL for pointers, 0 for characters).
   NOTE(review): presumably returns an ordering result when x or y equals z;
   only the (y) == (z) test is visible here, confirm against the full body. */
301 #define CAMEL_SEARCH_COMPARE(x, y, z) G_STMT_START { \
307 } else if ((y) == (z)) \
/* camel_ustrcasecmp: case-insensitive comparison of two UTF-8 strings,
   done on lower-cased decoded characters. */
312 camel_ustrcasecmp (const char *s1, const char *s2)
314 unicode_char_t u1, u2 = 0;
/* deal with NULL string pointers before decoding anything */
316 CAMEL_SEARCH_COMPARE (s1, s2, NULL);
/* both characters are folded to lower case before they are compared */
321 u1 = unicode_tolower (u1);
322 u2 = unicode_tolower (u2);
332 /* end of one of the strings ? */
333 CAMEL_SEARCH_COMPARE (u1, u2, 0);
335 /* if we have invalid utf8 sequence ? */
336 CAMEL_SEARCH_COMPARE (s1, s2, NULL);
/* camel_ustrncasecmp: length-limited counterpart of camel_ustrcasecmp();
   the comparison loop also stops once len reaches zero. */
342 camel_ustrncasecmp (const char *s1, const char *s2, size_t len)
344 unicode_char_t u1, u2 = 0;
/* deal with NULL string pointers before decoding anything */
346 CAMEL_SEARCH_COMPARE (s1, s2, NULL);
/* continue while budget remains and neither string has ended */
350 while (len > 0 && u1 && u2) {
351 u1 = unicode_tolower (u1);
352 u2 = unicode_tolower (u2);
366 /* end of one of the strings ? */
367 CAMEL_SEARCH_COMPARE (u1, u2, 0);
369 /* if we have invalid utf8 sequence ? */
370 CAMEL_SEARCH_COMPARE (s1, s2, NULL);
376 /* searches for match inside value, if match is mixed case, then use case-sensitive,
/* camel_search_header_match: test the header text value against match
   according to how.  SOUNDEX is delegated to header_soundex(); EXACT,
   CONTAINS, STARTS and ENDS are each implemented twice below, once with the
   byte-wise str* functions and once with the case-insensitive camel_ustr*
   helpers.
   NOTE(review): the ENDS cases read value + vlen - mlen, which is only
   valid when mlen <= vlen; confirm that a guard exists in the full source.
   NOTE(review): isspace() receives a plain char; bytes >= 0x80 may need an
   unsigned char cast. */
379 camel_search_header_match (const char *value, const char *match, camel_search_match_t how)
/* leading whitespace in value is ignored */
384 while (*value && isspace (*value))
387 if (how == CAMEL_SEARCH_MATCH_SOUNDEX)
388 return header_soundex (value, match);
/* byte lengths, used by the STARTS and ENDS cases */
390 vlen = strlen (value);
391 mlen = strlen (match);
395 /* from dan the man, if we have mixed case, perform a case-sensitive match,
401 case CAMEL_SEARCH_MATCH_EXACT:
402 return strcmp(value, match) == 0;
403 case CAMEL_SEARCH_MATCH_CONTAINS:
404 return strstr(value, match) != NULL;
405 case CAMEL_SEARCH_MATCH_STARTS:
406 return strncmp (value, match, mlen) == 0;
407 case CAMEL_SEARCH_MATCH_ENDS:
408 return strcmp (value + vlen - mlen, match) == 0;
417 case CAMEL_SEARCH_MATCH_EXACT:
418 return camel_ustrcasecmp(value, match) == 0;
419 case CAMEL_SEARCH_MATCH_CONTAINS:
420 return camel_ustrstrcase(value, match) != NULL;
421 case CAMEL_SEARCH_MATCH_STARTS:
422 return camel_ustrncasecmp (value, match, mlen) == 0;
423 case CAMEL_SEARCH_MATCH_ENDS:
424 return camel_ustrcasecmp (value + vlen - mlen, match) == 0;
432 /* performs a 'slow' content-based match */
433 /* there is also an identical copy of this in camel-filter-search.c */
/* camel_search_message_body_contains: recursively test whether the content
   of object matches the compiled regex pattern.  Multipart containers are
   searched part by part, embedded messages are descended into, and text
   parts are written to a memory stream that is handed to regexec().
   Content of any other type is not inspected. */
435 camel_search_message_body_contains(CamelDataWrapper *object, regex_t *pattern)
437 CamelDataWrapper *containee;
441 containee = camel_medium_get_content_object(CAMEL_MEDIUM(object));
443 if (containee == NULL)
446 /* TODO: I find it odd that get_part and get_content_object do not
447 add a reference, probably need fixing for multithreading */
449 /* using the object types is more accurate than using the mime/types */
450 if (CAMEL_IS_MULTIPART(containee)) {
451 parts = camel_multipart_get_number(CAMEL_MULTIPART(containee));
/* stop at the first part that matches */
452 for (i=0;i<parts && truth==FALSE;i++) {
453 CamelDataWrapper *part = (CamelDataWrapper *)camel_multipart_get_part(CAMEL_MULTIPART(containee), i);
455 truth = camel_search_message_body_contains(part, pattern);
457 } else if (CAMEL_IS_MIME_MESSAGE(containee)) {
458 /* for messages we only look at its contents */
459 truth = camel_search_message_body_contains((CamelDataWrapper *)containee, pattern);
460 } else if (header_content_type_is(CAMEL_DATA_WRAPPER(containee)->mime_type, "text", "*")) {
461 /* for all other text parts, we look inside, otherwise we do not care */
462 CamelStreamMem *mem = (CamelStreamMem *)camel_stream_mem_new();
464 camel_data_wrapper_write_to_stream(containee, (CamelStream *)mem);
/* append one NUL byte so the buffer can be passed to regexec() as a C string */
465 camel_stream_write((CamelStream *)mem, "", 1);
466 truth = regexec(pattern, mem->buffer->data, 0, NULL, 0) == 0;
/* drop our reference to the temporary memory stream */
467 camel_object_unref((CamelObject *)mem);