2 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
3 * Copyright (C) 2004-2012 Red Hat, Inc. All rights reserved.
5 * This file is part of the device-mapper userspace tools.
7 * This copyrighted material is made available to anyone wishing to use,
8 * modify, copy, or redistribute it subject to the terms and conditions
9 * of the GNU Lesser General Public License v.2.1.
11 * You should have received a copy of the GNU Lesser General Public License
12 * along with this program; if not, write to the Free Software Foundation,
13 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 #include "libdm/misc/dmlib.h"
22 struct dfa_state *next;
25 struct dfa_state *lookup[256];
28 struct dm_regex { /* Instance variables for the lexer */
29 struct dfa_state *start;
31 unsigned num_charsets;
33 struct rx_node **nodes;
35 struct rx_node **charsets;
36 struct dm_pool *scratch, *mem;
38 /* stuff for on the fly dfa calculation */
39 dm_bitset_t charmap[256];
43 struct dfa_state *h, *t;
46 static int _count_nodes(struct rx_node *rx)
51 r += _count_nodes(rx->left);
54 r += _count_nodes(rx->right);
59 static unsigned _count_charsets(struct rx_node *rx)
61 if (rx->type == CHARSET)
64 return (rx->left ? _count_charsets(rx->left) : 0) +
65 (rx->right ? _count_charsets(rx->right) : 0);
68 static void _enumerate_charsets_internal(struct rx_node *rx, unsigned *i)
70 if (rx->type == CHARSET)
71 rx->charset_index = (*i)++;
74 _enumerate_charsets_internal(rx->left, i);
76 _enumerate_charsets_internal(rx->right, i);
80 static void _enumerate_charsets(struct rx_node *rx)
83 _enumerate_charsets_internal(rx, &i);
86 static void _fill_table(struct dm_regex *m, struct rx_node *rx)
88 assert((rx->type != OR) || (rx->left && rx->right));
91 _fill_table(m, rx->left);
94 _fill_table(m, rx->right);
96 m->nodes[m->nodes_entered++] = rx;
97 if (rx->type == CHARSET)
98 m->charsets[m->charsets_entered++] = rx;
101 static int _create_bitsets(struct dm_regex *m)
106 for (i = 0; i < m->num_nodes; i++) {
108 if (!(n->firstpos = dm_bitset_create(m->scratch, m->num_charsets)))
110 if (!(n->lastpos = dm_bitset_create(m->scratch, m->num_charsets)))
112 if (!(n->followpos = dm_bitset_create(m->scratch, m->num_charsets)))
119 static void _calc_functions(struct dm_regex *m)
121 unsigned i, j, final = 1;
122 struct rx_node *rx, *c1, *c2;
124 for (i = 0; i < m->num_nodes; i++) {
129 if (rx->type == CHARSET && dm_bit(rx->charset, TARGET_TRANS))
135 dm_bit_union(rx->firstpos,
136 c1->firstpos, c2->firstpos);
138 dm_bit_copy(rx->firstpos, c1->firstpos);
141 dm_bit_union(rx->lastpos,
142 c1->lastpos, c2->lastpos);
144 dm_bit_copy(rx->lastpos, c2->lastpos);
146 rx->nullable = c1->nullable && c2->nullable;
150 dm_bit_copy(rx->firstpos, c1->firstpos);
151 dm_bit_copy(rx->lastpos, c1->lastpos);
152 rx->nullable = c1->nullable;
156 dm_bit_union(rx->firstpos, c1->firstpos, c2->firstpos);
157 dm_bit_union(rx->lastpos, c1->lastpos, c2->lastpos);
158 rx->nullable = c1->nullable || c2->nullable;
163 dm_bit_copy(rx->firstpos, c1->firstpos);
164 dm_bit_copy(rx->lastpos, c1->lastpos);
169 dm_bit_set(rx->firstpos, rx->charset_index);
170 dm_bit_set(rx->lastpos, rx->charset_index);
175 log_error(INTERNAL_ERROR "Unknown calc node type");
179 * followpos has it's own switch
180 * because PLUS and STAR do the
185 for (j = 0; j < m->num_charsets; j++) {
186 struct rx_node *n = m->charsets[j];
187 if (dm_bit(c1->lastpos, j))
188 dm_bit_union(n->followpos,
189 n->followpos, c2->firstpos);
195 for (j = 0; j < m->num_charsets; j++) {
196 struct rx_node *n = m->charsets[j];
197 if (dm_bit(rx->lastpos, j))
198 dm_bit_union(n->followpos,
199 n->followpos, rx->firstpos);
206 static struct dfa_state *_create_dfa_state(struct dm_pool *mem)
208 return dm_pool_zalloc(mem, sizeof(struct dfa_state));
211 static struct dfa_state *_create_state_queue(struct dm_pool *mem,
212 struct dfa_state *dfa,
215 if (!(dfa->bits = dm_bitset_create(mem, bits[0]))) /* first element is the size */
218 dm_bit_copy(dfa->bits, bits);
225 static int _calc_state(struct dm_regex *m, struct dfa_state *dfa, int a)
228 dm_bitset_t dfa_bits = dfa->bits;
229 dm_bit_and(m->dfa_copy, m->charmap[a], dfa_bits);
231 /* iterate through all the states in firstpos */
232 for (i = dm_bit_get_first(m->dfa_copy); i >= 0; i = dm_bit_get_next(m->dfa_copy, i)) {
233 if (a == TARGET_TRANS)
234 dfa->final = m->charsets[i]->final;
236 dm_bit_union(m->bs, m->bs, m->charsets[i]->followpos);
241 struct dfa_state *tmp;
242 struct dfa_state *ldfa = ttree_lookup(m->tt, m->bs + 1);
245 if (!(ldfa = _create_dfa_state(m->mem)))
248 ttree_insert(m->tt, m->bs + 1, ldfa);
249 if (!(tmp = _create_state_queue(m->scratch, ldfa, m->bs)))
259 dfa->lookup[a] = ldfa;
260 dm_bit_clear_all(m->bs);
266 static int _calc_states(struct dm_regex *m, struct rx_node *rx)
268 unsigned iwidth = (m->num_charsets / DM_BITS_PER_INT) + 1;
269 struct dfa_state *dfa;
274 if (!(m->tt = ttree_create(m->scratch, iwidth)))
277 if (!(m->bs = dm_bitset_create(m->scratch, m->num_charsets)))
280 /* build some char maps */
281 for (a = 0; a < 256; a++)
282 if (!(m->charmap[a] = dm_bitset_create(m->scratch, m->num_charsets)))
285 for (i = 0; i < m->num_nodes; i++) {
287 if (n->type == CHARSET) {
288 for (a = dm_bit_get_first(n->charset);
289 a >= 0; a = dm_bit_get_next(n->charset, a))
290 dm_bit_set(m->charmap[a], n->charset_index);
294 /* create first state */
295 if (!(dfa = _create_dfa_state(m->mem)))
299 ttree_insert(m->tt, rx->firstpos + 1, dfa);
301 /* prime the queue */
302 if (!(m->h = m->t = _create_state_queue(m->scratch, dfa, rx->firstpos)))
305 if (!(m->dfa_copy = dm_bitset_create(m->scratch, m->num_charsets)))
312 * Forces all the dfa states to be calculated up front, ie. what
313 * _calc_states() used to do before we switched to calculating on demand.
315 static int _force_states(struct dm_regex *m)
319 /* keep processing until there's nothing in the queue */
322 /* pop state off front of the queue */
325 /* iterate through all the inputs for this state */
326 dm_bit_clear_all(m->bs);
327 for (a = 0; a < 256; a++)
328 if (!_calc_state(m, s, a))
335 struct dm_regex *dm_regex_create(struct dm_pool *mem, const char * const *patterns,
336 unsigned num_patterns)
343 struct dm_pool *scratch = mem;
345 if (!(m = dm_pool_zalloc(mem, sizeof(*m))))
348 /* join the regexps together, delimiting with zero */
349 for (i = 0; i < num_patterns; i++)
350 len += strlen(patterns[i]) + 8;
352 ptr = all = dm_pool_alloc(scratch, len + 1);
357 for (i = 0; i < num_patterns; i++) {
358 ptr += sprintf(ptr, "(.*(%s)%c)", patterns[i], TARGET_TRANS);
359 if (i < (num_patterns - 1))
363 /* parse this expression */
364 if (!(rx = rx_parse_tok(scratch, all, ptr))) {
365 log_error("Couldn't parse regex");
370 m->scratch = scratch;
371 m->num_nodes = _count_nodes(rx);
372 m->num_charsets = _count_charsets(rx);
373 _enumerate_charsets(rx);
374 if (!(m->nodes = dm_pool_alloc(scratch, sizeof(*m->nodes) * m->num_nodes)))
377 if (!(m->charsets = dm_pool_alloc(scratch, sizeof(*m->charsets) * m->num_charsets)))
382 if (!_create_bitsets(m))
387 if (!_calc_states(m, rx))
393 dm_pool_free(mem, m);
398 static struct dfa_state *_step_matcher(struct dm_regex *m, int c, struct dfa_state *cs, int *r)
400 struct dfa_state *ns;
402 if (!(ns = cs->lookup[(unsigned char) c])) {
403 if (!_calc_state(m, cs, (unsigned char) c))
406 if (!(ns = cs->lookup[(unsigned char) c]))
410 // yuck, we have to special case the target trans
411 if ((ns->final == -1) &&
412 !_calc_state(m, ns, TARGET_TRANS))
415 if (ns->final && (ns->final > *r))
421 int dm_regex_match(struct dm_regex *regex, const char *s)
423 struct dfa_state *cs = regex->start;
426 dm_bit_clear_all(regex->bs);
427 if (!(cs = _step_matcher(regex, HAT_CHAR, cs, &r)))
431 if (!(cs = _step_matcher(regex, *s, cs, &r)))
434 _step_matcher(regex, DOLLAR_CHAR, cs, &r);
437 /* subtract 1 to get back to zero index */
442 * The next block of code concerns calculating a fingerprint for the dfa.
444 * We're not calculating a minimal dfa in _calculate_state (maybe a future
445 * improvement). As such it's possible that two non-isomorphic dfas
446 * recognise the same language. This can only really happen if you start
447 * with equivalent, but different regexes (for example the simplifier in
448 * parse_rx.c may have changed).
450 * The code is inefficient; repeatedly searching a singly linked list for
451 * previously seen nodes. Not worried since this is test code.
455 struct dfa_state *node;
456 struct node_list *next;
461 struct node_list *pending;
462 struct node_list *processed;
466 static uint32_t _randomise(uint32_t n)
469 uint32_t const prime = (~0) - 4;
473 static int _seen(struct node_list *n, struct dfa_state *node, uint32_t *i)
476 if (n->node == node) {
487 * Push node if it's not been seen before, returning a unique index.
489 static uint32_t _push_node(struct printer *p, struct dfa_state *node)
494 if (_seen(p->pending, node, &i) ||
495 _seen(p->processed, node, &i))
498 if (!(n = dm_pool_alloc(p->mem, sizeof(*n))))
501 n->node_id = ++p->next_index; /* start from 1, keep 0 as error code */
503 n->next = p->pending;
510 * Pop the front node, and fill out it's previously assigned index.
512 static struct dfa_state *_pop_node(struct printer *p)
514 struct dfa_state *node = NULL;
519 p->pending = n->next;
520 n->next = p->processed;
529 static uint32_t _combine(uint32_t n1, uint32_t n2)
531 return ((n1 << 8) | (n1 >> 24)) ^ _randomise(n2);
534 static uint32_t _fingerprint(struct printer *p)
538 struct dfa_state *node;
540 while ((node = _pop_node(p))) {
541 result = _combine(result, (node->final < 0) ? 0 : node->final);
542 for (c = 0; c < 256; c++)
543 result = _combine(result,
544 _push_node(p, node->lookup[c]));
550 uint32_t dm_regex_fingerprint(struct dm_regex *regex)
554 struct dm_pool *mem = dm_pool_create("regex fingerprint", 1024);
559 if (!_force_states(regex))
567 if (!_push_node(&p, regex->start))
570 result = _fingerprint(&p);
572 dm_pool_destroy(mem);