1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * Copyright (C) 2000-2012 Jeffrey Stedfast
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public License
7 * as published by the Free Software Foundation; either version 2.1
8 * of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free
17 * Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
31 #ifdef ENABLE_WARNINGS
35 #endif /* ENABLE_WARNINGS */
40 struct _trie_state *next;
41 struct _trie_state *fail;
42 struct _trie_match *match;
48 struct _trie_match *next;
49 struct _trie_state *state;
54 struct _trie_state root;
55 GPtrArray *fail_states;
59 static void trie_match_free (struct _trie_match *match);
60 static void trie_state_free (struct _trie_state *state);
62 static struct _trie_match *
65 return g_slice_new (struct _trie_match);
69 trie_match_free (struct _trie_match *match)
71 struct _trie_match *next;
75 trie_state_free (match->state);
76 g_slice_free (struct _trie_match, match);
81 static struct _trie_state *
84 return g_slice_new (struct _trie_state);
88 trie_state_free (struct _trie_state *state)
90 trie_match_free (state->match);
91 g_slice_free (struct _trie_state, state);
95 static inline gunichar
96 trie_utf8_getc (const char **in, size_t inlen)
98 register const unsigned char *inptr = (const unsigned char *) *in;
99 const unsigned char *inend = inptr + inlen;
100 register unsigned char c, r;
101 register gunichar m, u = 0;
108 *in = (const char *) inptr;
110 } else if (r < 0xfe) { /* valid start char? */
112 m = 0x7f80; /* used to mask out the length bits */
118 if ((c & 0xc0) != 0x80)
121 u = (u << 6) | (c & 0x3f);
126 *in = (const char *) inptr;
140 g_trie_new (gboolean icase)
144 trie = g_new (GTrie, 1);
145 trie->root.next = NULL;
146 trie->root.fail = NULL;
147 trie->root.match = NULL;
148 trie->root.final = 0;
150 trie->fail_states = g_ptr_array_new ();
158 g_trie_free (GTrie *trie)
160 g_ptr_array_free (trie->fail_states, TRUE);
161 trie_match_free (trie->root.match);
167 static struct _trie_match *
168 g (struct _trie_state *s, gunichar c)
170 struct _trie_match *m = s->match;
172 while (m && m->c != c)
178 static struct _trie_state *
179 trie_insert (GTrie *trie, guint depth, struct _trie_state *q, gunichar c)
181 struct _trie_match *m;
183 m = trie_match_new ();
188 q = m->state = trie_state_new ();
190 q->fail = &trie->root;
194 if (trie->fail_states->len < depth + 1) {
195 unsigned int size = trie->fail_states->len;
197 size = MAX (size + 64, depth + 1);
198 g_ptr_array_set_size (trie->fail_states, size);
201 q->next = trie->fail_states->pdata[depth];
202 trie->fail_states->pdata[depth] = q;
210 dump_trie (struct _trie_state *s, int depth)
212 char *p = g_alloca ((depth * 2) + 1);
213 struct _trie_match *m;
215 memset (p, ' ', depth * 2);
218 fprintf (stderr, "%s[state] %p: final=%d; pattern-id=%s; fail=%p\n",
219 p, s, s->final, s->id, s->fail);
222 fprintf (stderr, " %s'%c' -> %p\n", p, m->c, m->state);
224 dump_trie (m->state, depth + 1);
237 * IF g(q, pat[p][j]) == null
238 * insert(q, pat[p][j])
240 * q = g(q, pat[p][j])
242 * final = union(final, q)
247 g_trie_add (GTrie *trie, const char *pattern, int pattern_id)
249 const char *inptr = pattern;
250 struct _trie_state *q, *q1, *r;
251 struct _trie_match *m, *n;
255 /* Step 1: add the pattern to the trie */
259 while ((c = trie_utf8_getc (&inptr, -1))) {
261 w(g_warning ("Invalid UTF-8 sequence in pattern '%s' at %s",
262 pattern, (inptr - 1)));
267 c = g_unichar_tolower (c);
271 q = trie_insert (trie, depth, q, c);
282 /* Step 2: compute failure graph */
284 for (i = 0; i < trie->fail_states->len; i++) {
285 q = trie->fail_states->pdata[i];
292 while (r && (n = g (r, c)) == NULL)
297 if (q1->fail->final > q1->final)
298 q1->final = q1->fail->final;
300 if ((n = g (&trie->root, c)))
303 q1->fail = &trie->root;
313 d(fprintf (stderr, "\nafter adding pattern '%s' to trie %p:\n", pattern, trie));
314 d(dump_trie (&trie->root, 0));
322 * WHILE q != fail AND g(q, text[i]) == fail
330 * IF isElement(q, final)
338 g_trie_quick_search (GTrie *trie, const char *buffer, size_t buflen, int *matched_id)
340 const char *inptr, *inend, *prev, *pat;
341 register size_t inlen = buflen;
342 struct _trie_match *m = NULL;
343 struct _trie_state *q;
346 inend = buffer + buflen;
351 while ((c = trie_utf8_getc (&inptr, inlen))) {
352 inlen = (inend - inptr);
355 #ifdef ENABLE_WARNINGS
357 pat = buffer + buflen;
358 g_warning ("Invalid UTF-8 in buffer '%.*s' at %.*s",
359 buflen, buffer, pat - prev, prev);
366 c = g_unichar_tolower (c);
368 while (q != NULL && (m = g (q, c)) == NULL)
371 if (q == &trie->root)
377 } else if (m != NULL) {
395 g_trie_search (GTrie *trie, const char *buffer, size_t buflen, int *matched_id)
397 const char *inptr, *inend, *prev, *pat;
398 register size_t inlen = buflen;
399 struct _trie_match *m = NULL;
400 struct _trie_state *q;
404 inend = buffer + buflen;
409 while ((c = trie_utf8_getc (&inptr, inlen))) {
410 inlen = (inend - inptr);
416 #ifdef ENABLE_WARNINGS
418 pat = buffer + buflen;
419 g_warning ("Invalid UTF-8 in buffer '%.*s' at %.*s",
420 buflen, buffer, pat - prev, prev);
427 c = g_unichar_tolower (c);
429 while (q != NULL && (m = g (q, c)) == NULL && matched == 0)
432 if (q == &trie->root) {
445 } else if (m != NULL) {
448 if (q->final > matched) {
459 return matched ? pat : NULL;
465 static char *patterns[] = {
479 static char *haystacks[] = {
480 "try this url: http://www.ximian.com",
481 "or, feel free to email me at fejj@ximian.com",
482 "don't forget to check out www.ximian.com",
483 "I've attached a file (file:///cvs/gmime/gmime/gtrie.c)",
486 int main (int argc, char **argv)
493 trie = g_trie_new (TRUE);
494 for (i = 0; i < G_N_ELEMENTS (patterns); i++)
495 g_trie_add (trie, patterns[i], i);
497 for (i = 0; i < G_N_ELEMENTS (haystacks); i++) {
498 if ((match = g_trie_search (trie, haystacks[i], -1, &id))) {
499 fprintf (stderr, "matched @ '%s' with pattern '%s'\n", match, patterns[id]);
501 fprintf (stderr, "no match\n");
509 return match == NULL ? 0 : 1;