2 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
5 * This file is part of the device-mapper userspace tools.
7 * This copyrighted material is made available to anyone wishing to use,
8 * modify, copy, or redistribute it subject to the terms and conditions
9 * of the GNU Lesser General Public License v.2.1.
11 * You should have received a copy of the GNU Lesser General Public License
12 * along with this program; if not, write to the Free Software Foundation,
13 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 struct dfa_state *next;
25 struct dfa_state *lookup[256];
28 struct dm_regex { /* Instance variables for the lexer */
29 struct dfa_state *start;
31 unsigned num_charsets;
33 struct rx_node **nodes;
35 struct rx_node **charsets;
36 struct dm_pool *scratch, *mem;
38 /* stuff for on the fly dfa calculation */
39 dm_bitset_t charmap[256];
43 struct dfa_state *h, *t;
46 static int _count_nodes(struct rx_node *rx)
51 r += _count_nodes(rx->left);
54 r += _count_nodes(rx->right);
59 static unsigned _count_charsets(struct rx_node *rx)
61 if (rx->type == CHARSET)
64 return (rx->left ? _count_charsets(rx->left) : 0) +
65 (rx->right ? _count_charsets(rx->right) : 0);
68 static void _enumerate_charsets_internal(struct rx_node *rx, unsigned *i)
70 if (rx->type == CHARSET)
71 rx->charset_index = (*i)++;
74 _enumerate_charsets_internal(rx->left, i);
76 _enumerate_charsets_internal(rx->right, i);
80 static void _enumerate_charsets(struct rx_node *rx)
83 _enumerate_charsets_internal(rx, &i);
86 static void _fill_table(struct dm_regex *m, struct rx_node *rx)
88 assert((rx->type != OR) || (rx->left && rx->right));
91 _fill_table(m, rx->left);
94 _fill_table(m, rx->right);
96 m->nodes[m->nodes_entered++] = rx;
97 if (rx->type == CHARSET)
98 m->charsets[m->charsets_entered++] = rx;
101 static void _create_bitsets(struct dm_regex *m)
105 for (i = 0; i < m->num_nodes; i++) {
106 struct rx_node *n = m->nodes[i];
107 n->firstpos = dm_bitset_create(m->scratch, m->num_charsets);
108 n->lastpos = dm_bitset_create(m->scratch, m->num_charsets);
109 n->followpos = dm_bitset_create(m->scratch, m->num_charsets);
113 static void _calc_functions(struct dm_regex *m)
116 struct rx_node *rx, *c1, *c2;
118 for (i = 0; i < m->num_nodes; i++) {
123 if (rx->type == CHARSET && dm_bit(rx->charset, TARGET_TRANS))
129 dm_bit_union(rx->firstpos,
130 c1->firstpos, c2->firstpos);
132 dm_bit_copy(rx->firstpos, c1->firstpos);
135 dm_bit_union(rx->lastpos,
136 c1->lastpos, c2->lastpos);
138 dm_bit_copy(rx->lastpos, c2->lastpos);
140 rx->nullable = c1->nullable && c2->nullable;
144 dm_bit_copy(rx->firstpos, c1->firstpos);
145 dm_bit_copy(rx->lastpos, c1->lastpos);
146 rx->nullable = c1->nullable;
150 dm_bit_union(rx->firstpos, c1->firstpos, c2->firstpos);
151 dm_bit_union(rx->lastpos, c1->lastpos, c2->lastpos);
152 rx->nullable = c1->nullable || c2->nullable;
157 dm_bit_copy(rx->firstpos, c1->firstpos);
158 dm_bit_copy(rx->lastpos, c1->lastpos);
163 dm_bit_set(rx->firstpos, rx->charset_index);
164 dm_bit_set(rx->lastpos, rx->charset_index);
169 log_error(INTERNAL_ERROR "Unknown calc node type");
173 * followpos has it's own switch
174 * because PLUS and STAR do the
179 for (j = 0; j < m->num_charsets; j++) {
180 struct rx_node *n = m->charsets[j];
181 if (dm_bit(c1->lastpos, j))
182 dm_bit_union(n->followpos,
183 n->followpos, c2->firstpos);
189 for (j = 0; j < m->num_charsets; j++) {
190 struct rx_node *n = m->charsets[j];
191 if (dm_bit(rx->lastpos, j))
192 dm_bit_union(n->followpos,
193 n->followpos, rx->firstpos);
200 static struct dfa_state *_create_dfa_state(struct dm_pool *mem)
202 return dm_pool_zalloc(mem, sizeof(struct dfa_state));
205 static struct dfa_state *_create_state_queue(struct dm_pool *mem,
206 struct dfa_state *dfa,
209 dfa->bits = dm_bitset_create(mem, bits[0]); /* first element is the size */
210 dm_bit_copy(dfa->bits, bits);
216 static void _calc_state(struct dm_regex *m, struct dfa_state *dfa, int a)
219 dm_bitset_t dfa_bits = dfa->bits;
220 dm_bit_and(m->dfa_copy, m->charmap[a], dfa_bits);
222 /* iterate through all the states in firstpos */
223 for (i = dm_bit_get_first(m->dfa_copy); i >= 0; i = dm_bit_get_next(m->dfa_copy, i)) {
224 if (a == TARGET_TRANS)
225 dfa->final = m->charsets[i]->final;
227 dm_bit_union(m->bs, m->bs, m->charsets[i]->followpos);
232 struct dfa_state *tmp;
233 struct dfa_state *ldfa = ttree_lookup(m->tt, m->bs + 1);
236 ldfa = _create_dfa_state(m->mem);
237 ttree_insert(m->tt, m->bs + 1, ldfa);
238 tmp = _create_state_queue(m->scratch, ldfa, m->bs);
247 dfa->lookup[a] = ldfa;
248 dm_bit_clear_all(m->bs);
252 static int _calc_states(struct dm_regex *m, struct rx_node *rx)
254 unsigned iwidth = (m->num_charsets / DM_BITS_PER_INT) + 1;
255 struct dfa_state *dfa;
258 m->tt = ttree_create(m->scratch, iwidth);
262 if (!(m->bs = dm_bitset_create(m->scratch, m->num_charsets)))
265 /* build some char maps */
266 for (a = 0; a < 256; a++) {
267 m->charmap[a] = dm_bitset_create(m->scratch, m->num_charsets);
272 for (i = 0; i < m->num_nodes; i++) {
273 struct rx_node *n = m->nodes[i];
274 if (n->type == CHARSET) {
275 for (a = dm_bit_get_first(n->charset);
276 a >= 0; a = dm_bit_get_next(n->charset, a))
277 dm_bit_set(m->charmap[a], n->charset_index);
281 /* create first state */
282 dfa = _create_dfa_state(m->mem);
284 ttree_insert(m->tt, rx->firstpos + 1, dfa);
286 /* prime the queue */
287 m->h = m->t = _create_state_queue(m->scratch, dfa, rx->firstpos);
288 m->dfa_copy = dm_bitset_create(m->scratch, m->num_charsets);
293 * Forces all the dfa states to be calculated up front, ie. what
294 * _calc_states() used to do before we switched to calculating on demand.
296 static void _force_states(struct dm_regex *m)
300 /* keep processing until there's nothing in the queue */
303 /* pop state off front of the queue */
306 /* iterate through all the inputs for this state */
307 dm_bit_clear_all(m->bs);
308 for (a = 0; a < 256; a++)
309 _calc_state(m, s, a);
313 struct dm_regex *dm_regex_create(struct dm_pool *mem, const char * const *patterns,
314 unsigned num_patterns)
321 struct dm_pool *scratch = mem;
323 if (!(m = dm_pool_zalloc(mem, sizeof(*m))))
326 /* join the regexps together, delimiting with zero */
327 for (i = 0; i < num_patterns; i++)
328 len += strlen(patterns[i]) + 8;
330 ptr = all = dm_pool_alloc(scratch, len + 1);
335 for (i = 0; i < num_patterns; i++) {
336 ptr += sprintf(ptr, "(.*(%s)%c)", patterns[i], TARGET_TRANS);
337 if (i < (num_patterns - 1))
341 /* parse this expression */
342 if (!(rx = rx_parse_tok(scratch, all, ptr))) {
343 log_error("Couldn't parse regex");
348 m->scratch = scratch;
349 m->num_nodes = _count_nodes(rx);
350 m->num_charsets = _count_charsets(rx);
351 _enumerate_charsets(rx);
352 m->nodes = dm_pool_alloc(scratch, sizeof(*m->nodes) * m->num_nodes);
356 m->charsets = dm_pool_alloc(scratch, sizeof(*m->charsets) * m->num_charsets);
367 dm_pool_free(mem, m);
371 static struct dfa_state *_step_matcher(struct dm_regex *m, int c, struct dfa_state *cs, int *r)
373 struct dfa_state *ns;
375 if (!(ns = cs->lookup[(unsigned char) c])) {
376 _calc_state(m, cs, (unsigned char) c);
377 if (!(ns = cs->lookup[(unsigned char) c]))
381 // yuck, we have to special case the target trans
383 _calc_state(m, ns, TARGET_TRANS);
385 if (ns->final && (ns->final > *r))
391 int dm_regex_match(struct dm_regex *regex, const char *s)
393 struct dfa_state *cs = regex->start;
396 dm_bit_clear_all(regex->bs);
397 if (!(cs = _step_matcher(regex, HAT_CHAR, cs, &r)))
401 if (!(cs = _step_matcher(regex, *s, cs, &r)))
404 _step_matcher(regex, DOLLAR_CHAR, cs, &r);
407 /* subtract 1 to get back to zero index */
412 * The next block of code concerns calculating a fingerprint for the dfa.
414 * We're not calculating a minimal dfa in _calculate_state (maybe a future
415 * improvement). As such it's possible that two non-isomorphic dfas
416 * recognise the same language. This can only really happen if you start
417 * with equivalent, but different regexes (for example the simplifier in
418 * parse_rx.c may have changed).
420 * The code is inefficient; repeatedly searching a singly linked list for
421 * previously seen nodes. Not worried since this is test code.
425 struct dfa_state *node;
426 struct node_list *next;
431 struct node_list *pending;
432 struct node_list *processed;
436 static uint32_t randomise_(uint32_t n)
439 uint32_t const prime = (~0) - 4;
443 static int seen_(struct node_list *n, struct dfa_state *node, uint32_t *i)
446 if (n->node == node) {
457 * Push node if it's not been seen before, returning a unique index.
459 static uint32_t push_node_(struct printer *p, struct dfa_state *node)
462 if (seen_(p->pending, node, &i) ||
463 seen_(p->processed, node, &i))
466 struct node_list *n = dm_pool_alloc(p->mem, sizeof(*n));
468 n->node_id = p->next_index++;
470 n->next = p->pending;
477 * Pop the front node, and fill out it's previously assigned index.
479 static struct dfa_state *pop_node_(struct printer *p)
481 struct dfa_state *node = NULL;
484 struct node_list *n = p->pending;
485 p->pending = n->next;
486 n->next = p->processed;
495 static uint32_t combine_(uint32_t n1, uint32_t n2)
497 return ((n1 << 8) | (n1 >> 24)) ^ randomise_(n2);
500 static uint32_t fingerprint_(struct printer *p)
504 struct dfa_state *node;
506 while ((node = pop_node_(p))) {
507 result = combine_(result, node->final < 0 ? 0 : node->final);
508 for (c = 0; c < 256; c++)
509 result = combine_(result,
510 push_node_(p, node->lookup[c]));
516 uint32_t dm_regex_fingerprint(struct dm_regex *regex)
520 struct dm_pool *mem = dm_pool_create("regex fingerprint", 1024);
522 _force_states(regex);
530 push_node_(&p, regex->start);
531 result = fingerprint_(&p);
532 dm_pool_destroy(mem);