src/fa.c

   1 /*
   2  * fa.c: finite automata
   3  *
   4  * Copyright (C) 2007-2015 David Lutterkort
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
  19  *
  20  * Author: David Lutterkort <dlutter@redhat.com>
  21  */
  22
  23 /*
  24  * This implementation follows closely the Java dk.brics.automaton package
  25  * by Anders Moeller. The project's website is
  26  * http://www.brics.dk/automaton/.
  27  *
  28  * It is by no means a complete reimplementation of that package; only a
  29  * subset of what Automaton provides is implemented here.
  30  */
  31
  32 #include <config.h>
  33 #include <limits.h>
  34 #include <ctype.h>
  35 #include <stdbool.h>
  36
  37 #include "internal.h"
  38 #include "memory.h"
  39 #include "ref.h"
  40 #include "hash.h"
  41 #include "fa.h"
  42
  43 #define UCHAR_NUM (UCHAR_MAX+1)
  44 #define UCHAR_MIN 0
  45 typedef unsigned char uchar;
  46
  47 #define E(cond) if (cond) goto error
  48 #define F(expr) if ((expr) < 0) goto error
  49
  50 /* Which algorithm to use in FA_MINIMIZE */
  51 int fa_minimization_algorithm = FA_MIN_HOPCROFT;
  52
  53 /* A finite automaton. INITIAL is both the initial state and the head of
  54  * the list of all states. Any state that is allocated for this automaton
  55  * is put on this list. Dead/unreachable states are cleared from the list
  56  * at opportune times (e.g., during minimization) It's poor man's garbage
  57  * collection
  58  *
  59  * Normally, transitions are on a character range [min..max]; in
  60  * fa_as_regexp, we store regexps on transitions in the re field of each
  61  * transition. TRANS_RE indicates that we do that, and is used by fa_dot to
  62  * produce proper graphs of an automaton transitioning on regexps.
  63  *
  64  * For case-insensitive regexps (nocase == 1), the FA never has transitions
  65  * on uppercase letters [A-Z], effectively removing these letters from the
  66  * alphabet.
  67  */
  68 struct fa {
  69     struct state *initial;
  70     int           deterministic : 1;
  71     int           minimal : 1;
  72     unsigned int  nocase : 1;
  73     int           trans_re : 1;
  74 };
  75
  76 /* A state in a finite automaton. Transitions are never shared between
  77    states so that we can free the list when we need to free the state */
  78 struct state {
  79     struct state *next;
  80     hash_val_t    hash;
  81     unsigned int  accept : 1;
  82     unsigned int  live : 1;
  83     unsigned int  reachable : 1;
  84     unsigned int  visited : 1;   /* Used in various places to track progress */
  85     /* Array of transitions. The TUSED first entries are used, the array
  86        has allocated room for TSIZE */
  87     size_t        tused;
  88     size_t        tsize;
  89     struct trans *trans;
  90 };
  91
  92 /* A transition. If the input has a character in the inclusive
  93  * range [MIN, MAX], move to TO
  94  */
  95 struct trans {
  96     struct state *to;
  97     union {
  98         struct {
  99             uchar         min;
 100             uchar         max;
 101         };
 102         struct re *re;
 103     };
 104 };
 105
 106 /*
 107  * Bitsets
 108  */
 109 #define UINT_BIT (sizeof(unsigned int) * CHAR_BIT)
 110
 111 typedef unsigned int bitset;
 112
 113 static bitset *bitset_init(size_t nbits) {
 114     bitset *bs;
 115     if (ALLOC_N(bs, (nbits + UINT_BIT) / UINT_BIT) == -1)
 116         return NULL;
 117     return bs;
 118 }
 119
 120 static inline void bitset_clr(bitset *bs, unsigned int bit) {
 121     bs[bit/UINT_BIT] &= ~(1 << (bit % UINT_BIT));
 122 }
 123
 124 static inline void bitset_set(bitset *bs, unsigned int bit) {
 125     bs[bit/UINT_BIT] |= 1 << (bit % UINT_BIT);
 126 }
 127
 128 ATTRIBUTE_PURE
 129 static inline int bitset_get(const bitset * const bs, unsigned int bit) {
 130     return (bs[bit/UINT_BIT] >> bit % UINT_BIT) & 1;
 131 }
 132
 133 ATTRIBUTE_PURE
 134 static inline bool bitset_disjoint(const bitset *const bs1,
 135                                    const bitset *const bs2,
 136                                    size_t nbits) {
 137     for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++) {
 138         if (bs1[i] & bs2[i])
 139             return false;
 140     }
 141     return true;
 142 }
 143
 144 static void bitset_free(bitset *bs) {
 145     free(bs);
 146 }
 147
 148 static void bitset_negate(bitset *bs, size_t nbits) {
 149     for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++)
 150         bs[i] = ~ bs[i];
 151 }
 152
 153 /*
 154  * Representation of a parsed regular expression. The regular expression is
 155  * parsed according to the following grammar by PARSE_REGEXP:
 156  *
 157  * regexp:        concat_exp ('|' regexp)?
 158  * concat_exp:    repeated_exp concat_exp?
 159  * repeated_exp:  simple_exp
 160  *              | simple_exp '*'
 161  *              | simple_exp '+'
 162  *              | simple_exp '?'
 163  *              | simple_exp '{' INT (',' INT '}')?
 164  * simple_exp:  char_class
 165  *            | '.'
 166  *            |  '(' regexp ')'
 167  *            |  CHAR
 168  * char_class: '[' char_exp ']'
 169  *           | '[' '^' char_exp ']'
 170  * char_exp:   CHAR '-' CHAR
 171  *           | CHAR
 172  */
 173
 174 enum re_type {
 175     UNION,
 176     CONCAT,
 177     CSET,
 178     CHAR,
 179     ITER,
 180     EPSILON
 181 };
 182
 183 #define re_unref(r) unref(r, re)
 184
 185 struct re {
 186     ref_t        ref;
 187     enum re_type type;
 188     union {
 189         struct {                  /* UNION, CONCAT */
 190             struct re *exp1;
 191             struct re *exp2;
 192         };
 193         struct {                  /* CSET */
 194             bool    negate;
 195             bitset *cset;
 196             /* Whether we can use character ranges when converting back
 197              * to a string */
 198             unsigned int no_ranges:1;
 199         };
 200         struct {                  /* CHAR */
 201             uchar c;
 202         };
 203         struct {                  /* ITER */
 204             struct re *exp;
 205             int min;
 206             int max;
 207         };
 208     };
 209 };
 210
 211 /* Used to keep state of the regex parse; RX may contain NUL's */
 212 struct re_parse {
 213     const char *rx;          /* Current position in regex */
 214     const char *rend;        /* Last char of rx+ 1 */
 215     int         error;       /* error code */
 216     /* Whether new CSET's should have the no_ranges flag set */
 217     unsigned int no_ranges:1;
 218 };
 219
 220 /* String with explicit length, used when converting re to string */
 221 struct re_str {
 222     char  *rx;
 223     size_t len;
 224 };
 225
 226 static struct re *parse_regexp(struct re_parse *parse);
 227
 228 /* A map from a set of states to a state. */
 229 typedef hash_t state_set_hash;
 230
 231 static hash_val_t ptr_hash(const void *p);
 232
 233 static const int array_initial_size = 4;
 234 static const int array_max_expansion   = 128;
 235
 236 enum state_set_init_flags {
 237     S_NONE   = 0,
 238     S_SORTED = (1 << 0),
 239     S_DATA   = (1 << 1)
 240 };
 241
 242 struct state_set {
 243     size_t            size;
 244     size_t            used;
 245     unsigned int      sorted : 1;
 246     unsigned int      with_data : 1;
 247     struct state    **states;
 248     void            **data;
 249 };
 250
 251 struct state_set_list {
 252     struct state_set_list *next;
 253     struct state_set      *set;
 254 };
 255
 256 /* Clean up FA by removing dead transitions and states and reducing
 257  * transitions. Unreachable states are freed. The return value is the same
 258  * as FA; returning it is merely a convenience.
 259  *
 260  * Only automata in this state should be returned to the user
 261  */
 262 ATTRIBUTE_RETURN_CHECK
 263 static int collect(struct fa *fa);
 264
 265 ATTRIBUTE_RETURN_CHECK
 266 static int totalize(struct fa *fa);
 267
 268 /* Print an FA into a (fairly) fixed file if the environment variable
 269  * FA_DOT_DIR is set. This code is only used for debugging
 270  */
 271 #define FA_DOT_DIR "FA_DOT_DIR"
 272
 273 ATTRIBUTE_UNUSED
 274 static void fa_dot_debug(struct fa *fa, const char *tag) {
 275     const char *dot_dir;
 276     static int count = 0;
 277     int r;
 278     char *fname;
 279     FILE *fp;
 280
 281     if ((dot_dir = getenv(FA_DOT_DIR)) == NULL)
 282         return;
 283
 284     r = asprintf(&fname, "%s/fa_%02d_%s.dot", dot_dir, count++, tag);
 285     if (r == -1)
 286         return;
 287
 288     fp = fopen(fname, "w");
 289     fa_dot(fp, fa);
 290     fclose(fp);
 291     free(fname);
 292 }
 293
 294 static void print_char_set(struct re *set) {
 295     int from, to;
 296
 297     if (set->negate)
 298         printf("[^");
 299     else
 300         printf("[");
 301     for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
 302         while (bitset_get(set->cset, from) == set->negate)
 303             from += 1;
 304         if (from > UCHAR_MAX)
 305             break;
 306         for (to = from;
 307              to < UCHAR_MAX && (bitset_get(set->cset, to+1) == !set->negate);
 308              to++);
 309         if (to == from) {
 310             printf("%c", from);
 311         } else {
 312             printf("%c-%c", from, to);
 313         }
 314     }
 315     printf("]");
 316 }
 317
 318 ATTRIBUTE_UNUSED
 319 static void print_re(struct re *re) {
 320     switch(re->type) {
 321     case UNION:
 322         print_re(re->exp1);
 323         printf("|");
 324         print_re(re->exp2);
 325         break;
 326     case CONCAT:
 327         print_re(re->exp1);
 328         printf(".");
 329         print_re(re->exp2);
 330         break;
 331     case CSET:
 332         print_char_set(re);
 333         break;
 334     case CHAR:
 335         printf("%c", re->c);
 336         break;
 337     case ITER:
 338         printf("(");
 339         print_re(re->exp);
 340         printf("){%d,%d}", re->min, re->max);
 341         break;
 342     case EPSILON:
 343         printf("<>");
 344         break;
 345     default:
 346         printf("(**)");
 347         break;
 348     }
 349 }
 350
 351 /*
 352  * struct re_str
 353  */
 354 static void release_re_str(struct re_str *str) {
 355     if (str == NULL)
 356         return;
 357     FREE(str->rx);
 358     str->len = 0;
 359 }
 360
 361 static void free_re_str(struct re_str *str) {
 362     if (str == NULL)
 363         return;
 364     release_re_str(str);
 365     FREE(str);
 366 }
 367
 368 static struct re_str *make_re_str(const char *s) {
 369     struct re_str *str;
 370
 371     if (ALLOC(str) < 0)
 372         return NULL;
 373     if (s != NULL) {
 374         str->rx = strdup(s);
 375         str->len = strlen(s);
 376         if (str->rx == NULL) {
 377             FREE(str);
 378             return NULL;
 379         }
 380     }
 381     return str;
 382 }
 383
 384 static int re_str_alloc(struct re_str *str) {
 385     return ALLOC_N(str->rx, str->len + 1);
 386 }
 387
 388 /*
 389  * Memory management
 390  */
 391
 392 static void free_trans(struct state *s) {
 393     free(s->trans);
 394     s->trans = NULL;
 395     s->tused = s->tsize = 0;
 396 }
 397
 398 static void gut(struct fa *fa) {
 399     list_for_each(s, fa->initial) {
 400         free_trans(s);
 401     }
 402     list_free(fa->initial);
 403     fa->initial = NULL;
 404 }
 405
 406 void fa_free(struct fa *fa) {
 407     if (fa == NULL)
 408         return;
 409     gut(fa);
 410     free(fa);
 411 }
 412
 413 static struct state *make_state(void) {
 414     struct state *s;
 415     if (ALLOC(s) == -1)
 416         return NULL;
 417     s->hash = ptr_hash(s);
 418     return s;
 419 }
 420
 421 static struct state *add_state(struct fa *fa, int accept) {
 422     struct state *s = make_state();
 423     if (s) {
 424         s->accept = accept;
 425         if (fa->initial == NULL) {
 426             fa->initial = s;
 427         } else {
 428             list_cons(fa->initial->next, s);
 429         }
 430     }
 431     return s;
 432 }
 433
 434 #define last_trans(s)  ((s)->trans + (s)->tused - 1)
 435
 436 #define for_each_trans(t, s)                                            \
 437     for (struct trans *t = (s)->trans;                                  \
 438          (t - (s)->trans) < (s)->tused;                                 \
 439          t++)
 440
 441 ATTRIBUTE_RETURN_CHECK
 442 static int add_new_trans(struct state *from, struct state *to,
 443                          uchar min, uchar max) {
 444     assert(to != NULL);
 445
 446     if (from->tused == from->tsize) {
 447         size_t tsize = from->tsize;
 448         if (tsize == 0)
 449             tsize = array_initial_size;
 450         else if (from->tsize > array_max_expansion)
 451             tsize += array_max_expansion;
 452         else
 453             tsize *= 2;
 454         if (REALLOC_N(from->trans, tsize) == -1)
 455             return -1;
 456         from->tsize = tsize;
 457     }
 458     from->trans[from->tused].to  = to;
 459     from->trans[from->tused].min = min;
 460     from->trans[from->tused].max = max;
 461     from->tused += 1;
 462     return 0;
 463 }
 464
 465 ATTRIBUTE_RETURN_CHECK
 466 static int add_epsilon_trans(struct state *from,
 467                              struct state *to) {
 468     int r;
 469     from->accept |= to->accept;
 470     for_each_trans(t, to) {
 471         r = add_new_trans(from, t->to, t->min, t->max);
 472         if (r < 0)
 473             return -1;
 474     }
 475     return 0;
 476 }
 477
 478 static void set_initial(struct fa *fa, struct state *s) {
 479     list_remove(s, fa->initial);
 480     list_cons(fa->initial, s);
 481 }
 482
 483 /* Merge automaton FA2 into FA1. This simply adds FA2's states to FA1
 484    and then frees FA2. It has no influence on the language accepted by FA1
 485 */
 486 static void fa_merge(struct fa *fa1, struct fa **fa2) {
 487     list_append(fa1->initial, (*fa2)->initial);
 488     free(*fa2);
 489     *fa2 = NULL;
 490 }
 491
 492 /*
 493  * Operations on STATE_SET
 494  */
 495 static void state_set_free(struct state_set *set) {
 496     if (set == NULL)
 497         return;
 498     free(set->states);
 499     free(set->data);
 500     free(set);
 501 }
 502
 503 static int state_set_init_data(struct state_set *set) {
 504     set->with_data = 1;
 505     if (set->data == NULL)
 506         return ALLOC_N(set->data, set->size);
 507     else
 508         return 0;
 509 }
 510
 511 /* Create a new STATE_SET with an initial size of SIZE. If SIZE is -1, use
 512    the default size ARRAY_INITIAL_SIZE. FLAGS is a bitmask indicating
 513    some options:
 514    - S_SORTED: keep the states in the set sorted by their address, and use
 515      binary search for lookups. If it is not set, entries are kept in the
 516      order in which they are added and lookups scan linearly through the
 517      set of states.
 518    - S_DATA: allocate the DATA array in the set, and keep its size in sync
 519      with the size of the STATES array.
 520 */
 521 static struct state_set *state_set_init(int size, int flags) {
 522     struct state_set *set = NULL;
 523
 524     F(ALLOC(set));
 525
 526     set->sorted = (flags & S_SORTED) ? 1 : 0;
 527     set->with_data = (flags & S_DATA) ? 1 : 0;
 528     if (size > 0) {
 529         set->size = size;
 530         F(ALLOC_N(set->states, set->size));
 531         if (set->with_data)
 532             F(state_set_init_data(set));
 533     }
 534     return set;
 535  error:
 536     state_set_free(set);
 537     return NULL;
 538 }
 539
 540 ATTRIBUTE_RETURN_CHECK
 541 static int state_set_expand(struct state_set *set) {
 542     if (set->size == 0)
 543         set->size = array_initial_size;
 544     else if (set->size > array_max_expansion)
 545         set->size += array_max_expansion;
 546     else
 547         set->size *= 2;
 548     if (REALLOC_N(set->states, set->size) < 0)
 549         goto error;
 550     if (set->with_data)
 551         if (REALLOC_N(set->data, set->size) < 0)
 552             goto error;
 553     return 0;
 554  error:
 555     /* We do this to provoke a SEGV as early as possible */
 556     FREE(set->states);
 557     FREE(set->data);
 558     return -1;
 559 }
 560
 561 /* Return the index where S belongs in SET->STATES to keep it sorted.  S
 562    may not be in SET->STATES. The returned index is in the interval [0
 563    .. SET->USED], with the latter indicating that S is larger than all
 564    values in SET->STATES
 565 */
 566 static int state_set_pos(const struct state_set *set, const struct state *s) {
 567     int l = 0, h = set->used;
 568     while (l < h) {
 569         int m = (l + h)/2;
 570         if (set->states[m] > s)
 571             h = m;
 572         else if (set->states[m] < s)
 573             l = m + 1;
 574         else
 575             return m;
 576     }
 577     return l;
 578 }
 579
 580 ATTRIBUTE_RETURN_CHECK
 581 static int state_set_push(struct state_set *set, struct state *s) {
 582     if (set->size == set->used)
 583         if (state_set_expand(set) < 0)
 584             return -1;
 585     if (set->sorted) {
 586         int p = state_set_pos(set, s);
 587         if (set->size == set->used)
 588             if (state_set_expand(set) < 0)
 589                 return -1;
 590         while (p < set->used && set->states[p] <= s)
 591             p += 1;
 592         if (p < set->used) {
 593             memmove(set->states + p + 1, set->states + p,
 594                     sizeof(*set->states) * (set->used - p));
 595             if (set->data != NULL)
 596                 memmove(set->data + p + 1, set->data + p,
 597                         sizeof(*set->data) * (set->used - p));
 598         }
 599         set->states[p] = s;
 600         set->used += 1;
 601         return p;
 602     } else {
 603         set->states[set->used++] = s;
 604         return set->used - 1;
 605     }
 606 }
 607
 608 ATTRIBUTE_RETURN_CHECK
 609 static int state_set_push_data(struct state_set *set, struct state *s,
 610                                void *d) {
 611     int i = state_set_push(set, s);
 612     if (i == -1)
 613         return -1;
 614     set->data[i] = d;
 615     return i;
 616 }
 617
 618 static int state_set_index(const struct state_set *set,
 619                            const struct state *s) {
 620     if (set->sorted) {
 621         int p = state_set_pos(set, s);
 622         return (p < set->used && set->states[p] == s) ? p : -1;
 623     } else {
 624         for (int i=0; i < set->used; i++) {
 625             if (set->states[i] == s)
 626                 return i;
 627         }
 628     }
 629     return -1;
 630 }
 631
 632 static void state_set_remove(struct state_set *set,
 633                              const struct state *s) {
 634    if (set->sorted) {
 635        int p = state_set_index(set, s);
 636        if (p == -1) return;
 637        memmove(set->states + p, set->states + p + 1,
 638                sizeof(*set->states) * (set->used - p - 1));
 639        if (set->data != NULL)
 640            memmove(set->data + p, set->data + p + 1,
 641                    sizeof(*set->data) * (set->used - p - 1));
 642    } else {
 643        int p = state_set_index(set, s);
 644        if (p >= 0) {
 645            set->states[p] = set->states[--set->used];
 646        }
 647    }
 648 }
 649
 650 /* Only add S if it's not in SET yet. Return 1 if S was added, 0 if it was
 651    already in the set and -1 on error. */
 652 ATTRIBUTE_RETURN_CHECK
 653 static int state_set_add(struct state_set *set, struct state *s) {
 654     if (set->sorted) {
 655         int p = state_set_pos(set, s);
 656         if (p < set->used && set->states[p] == s)
 657             return 0;
 658         if (set->size == set->used)
 659             if (state_set_expand(set) < 0)
 660                 return -1;
 661         while (p < set->used && set->states[p] <= s)
 662             p += 1;
 663         if (p < set->used) {
 664             memmove(set->states + p + 1, set->states + p,
 665                     sizeof(*set->states) * (set->used - p));
 666             if (set->data != NULL)
 667                 memmove(set->data + p + 1, set->data + p,
 668                         sizeof(*set->data) * (set->used - p));
 669         }
 670         set->states[p] = s;
 671         set->used += 1;
 672     } else {
 673         if (state_set_index(set, s) >= 0)
 674             return 0;
 675         if (state_set_push(set, s) < 0)
 676             goto error;
 677     }
 678     return 1;
 679  error:
 680     /* We do this to provoke a SEGV as early as possible */
 681     FREE(set->states);
 682     FREE(set->data);
 683     return -1;
 684 }
 685
 686 static struct state *state_set_pop(struct state_set *set) {
 687     struct state *s = NULL;
 688     if (set->used > 0)
 689         s = set->states[--set->used];
 690     return s;
 691 }
 692
 693 static struct state *state_set_pop_data(struct state_set *set, void **d) {
 694     struct state *s = NULL;
 695     s = state_set_pop(set);
 696     *d = set->data[set->used];
 697     return s;
 698 }
 699
 700 static void *state_set_find_data(struct state_set *set, struct state *s) {
 701     int i = state_set_index(set, s);
 702     if (i >= 0)
 703         return set->data[i];
 704     else
 705         return NULL;
 706 }
 707
 708 static int state_set_equal(const struct state_set *set1,
 709                            const struct state_set *set2) {
 710     if (set1->used != set2->used)
 711         return 0;
 712     if (set1->sorted && set2->sorted) {
 713         for (int i = 0; i < set1->used; i++)
 714             if (set1->states[i] != set2->states[i])
 715                 return 0;
 716         return 1;
 717     } else {
 718         for (int i=0; i < set1->used; i++)
 719             if (state_set_index(set2, set1->states[i]) == -1)
 720                 return 0;
 721         return 1;
 722     }
 723 }
 724
 725 #if 0
 726 static void state_set_compact(struct state_set *set) {
 727     while (set->used > 0 && set->states[set->used] == NULL)
 728         set->used -= 1;
 729     for (int i=0; i < set->used; i++) {
 730         if (set->states[i] == NULL) {
 731             set->used -= 1;
 732             set->states[i] = set->states[set->used];
 733             if (set->data)
 734                 set->data[i] = set->data[set->used];
 735         }
 736         while (set->used > 0 && set->states[set->used] == NULL)
 737             set->used -= 1;
 738     }
 739 }
 740 #endif
 741
 742 /* Add an entry (FST, SND) to SET. FST is stored in SET->STATES, and SND is
 743    stored in SET->DATA at the same index.
 744 */
 745 ATTRIBUTE_RETURN_CHECK
 746 static int state_pair_push(struct state_set **set,
 747                            struct state *fst, struct state *snd) {
 748     if (*set == NULL)
 749         *set = state_set_init(-1, S_DATA);
 750     if (*set == NULL)
 751         return -1;
 752     int i = state_set_push(*set, fst);
 753     if (i == -1)
 754         return -1;
 755     (*set)->data[i] = snd;
 756
 757     return 0;
 758 }
 759
 760 /* Return the index of the pair (FST, SND) in SET, or -1 if SET contains no
 761    such pair.
 762  */
 763 static int state_pair_find(struct state_set *set, struct state *fst,
 764                            struct state *snd) {
 765     for (int i=0; i < set->used; i++)
 766         if (set->states[i] == fst && set->data[i] == snd)
 767             return i;
 768     return -1;
 769 }
 770
 771 /* Jenkins' hash for void* */
 772 static hash_val_t ptr_hash(const void *p) {
 773     hash_val_t hash = 0;
 774     char *c = (char *) &p;
 775     for (int i=0; i < sizeof(p); i++) {
 776         hash += c[i];
 777         hash += (hash << 10);
 778         hash ^= (hash >> 6);
 779     }
 780     hash += (hash << 3);
 781     hash ^= (hash >> 11);
 782     hash += (hash << 15);
 783     return hash;
 784 }
 785
 786 typedef hash_t state_triple_hash;
 787
 788 static hash_val_t pair_hash(const void *key) {
 789     register struct state *const *pair = key;
 790     return pair[0]->hash + pair[1]->hash;
 791 }
 792
 793 static int pair_cmp(const void *key1, const void *key2) {
 794     return memcmp(key1, key2, 2*sizeof(struct state *));
 795 }
 796
 797 static void state_triple_node_free(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
 798     free((void *) hnode_getkey(node));
 799     free(node);
 800 }
 801
 802 static state_triple_hash *state_triple_init(void) {
 803     state_triple_hash *hash;
 804
 805     hash = hash_create(HASHCOUNT_T_MAX, pair_cmp, pair_hash);
 806     if (hash == NULL)
 807         return NULL;
 808     hash_set_allocator(hash, NULL, state_triple_node_free, NULL);
 809     return hash;
 810 }
 811
 812 ATTRIBUTE_RETURN_CHECK
 813 static int state_triple_push(state_triple_hash *hash,
 814                              struct state *s1,
 815                              struct state *s2,
 816                              struct state *s3) {
 817     struct state **pair;
 818     if (ALLOC_N(pair, 2) < 0)
 819         return -1;
 820     pair[0] = s1;
 821     pair[1] = s2;
 822     return hash_alloc_insert(hash, pair, s3);
 823 }
 824
 825 static struct state * state_triple_thd(state_triple_hash *hash,
 826                                        struct state *s1,
 827                                        struct state *s2) {
 828     struct state *pair[2];
 829     hnode_t *node;
 830     pair[0] = s1;
 831     pair[1] = s2;
 832     node = hash_lookup(hash, pair);
 833     return (node == NULL) ? NULL : (struct state *) hnode_get(node);
 834 }
 835
 836 static void state_triple_free(state_triple_hash *hash) {
 837     if (hash != NULL) {
 838         hash_free_nodes(hash);
 839         hash_destroy(hash);
 840     }
 841 }
 842
 843 /*
 844  * State operations
 845  */
 846 ATTRIBUTE_RETURN_CHECK
 847 static int mark_reachable(struct fa *fa) {
 848     struct state_set *worklist = state_set_init(-1, S_NONE);
 849     int result = -1;
 850
 851     E(worklist == NULL);
 852
 853     list_for_each(s, fa->initial) {
 854         s->reachable = 0;
 855     }
 856     fa->initial->reachable = 1;
 857
 858     for (struct state *s = fa->initial;
 859          s != NULL;
 860          s = state_set_pop(worklist)) {
 861         for_each_trans(t, s) {
 862             if (! t->to->reachable) {
 863                 t->to->reachable = 1;
 864                 F(state_set_push(worklist, t->to));
 865             }
 866         }
 867     }
 868     result = 0;
 869
 870  error:
 871     state_set_free(worklist);
 872     return result;
 873 }
 874
 875 /* Return all reachable states. As a sideeffect, all states have their
 876    REACHABLE flag set appropriately.
 877  */
 878 static struct state_set *fa_states(struct fa *fa) {
 879     struct state_set *visited = state_set_init(-1, S_NONE);
 880     int r;
 881
 882     r = mark_reachable(fa);
 883     E(visited == NULL || r < 0);
 884
 885     list_for_each(s, fa->initial) {
 886         if (s->reachable)
 887             F(state_set_push(visited, s));
 888     }
 889     return visited;
 890  error:
 891     state_set_free(visited);
 892     return NULL;
 893 }
 894
 895 /* Return all reachable accepting states. As a sideeffect, all states have
 896    their REACHABLE flag set appropriately.
 897  */
 898 static struct state_set *fa_accept_states(struct fa *fa) {
 899     struct state_set *accept = state_set_init(-1, S_NONE);
 900     int r;
 901
 902     E(accept == NULL);
 903
 904     r = mark_reachable(fa);
 905     E(r < 0);
 906
 907     list_for_each(s, fa->initial) {
 908         if (s->reachable && s->accept)
 909             F(state_set_push(accept, s));
 910     }
 911     return accept;
 912  error:
 913     state_set_free(accept);
 914     return NULL;
 915 }
 916
 917 /* Mark all live states, i.e. states from which an accepting state can be
 918    reached. All states have their REACHABLE and LIVE flags set
 919    appropriately.
 920  */
 921 ATTRIBUTE_RETURN_CHECK
 922 static int mark_live(struct fa *fa) {
 923     int changed;
 924
 925     F(mark_reachable(fa));
 926
 927     list_for_each(s, fa->initial) {
 928         s->live = s->reachable && s->accept;
 929     }
 930
 931     do {
 932         changed = 0;
 933         list_for_each(s, fa->initial) {
 934             if (! s->live && s->reachable) {
 935                 for_each_trans(t, s) {
 936                     if (t->to->live) {
 937                         s->live = 1;
 938                         changed = 1;
 939                         break;
 940                     }
 941                 }
 942             }
 943         }
 944     } while (changed);
 945     return 0;
 946  error:
 947     return -1;
 948 }
 949
 950 /*
 951  * Reverse an automaton in place. Change FA so that it accepts the
 952  * language that is the reverse of the input automaton.
 953  *
 954  * Returns a list of the new initial states of the automaton. The list must
 955  * be freed by the caller.
 956  */
 957 static struct state_set *fa_reverse(struct fa *fa) {
 958     struct state_set *all = NULL;
 959     struct state_set *accept = NULL;
 960     int r;
 961
 962     all = fa_states(fa);
 963     E(all == NULL);
 964     accept = fa_accept_states(fa);
 965     E(accept == NULL);
 966
 967     F(state_set_init_data(all));
 968
 969     /* Reverse all transitions */
 970     int *tused;
 971     F(ALLOC_N(tused, all->used));
 972     for (int i=0; i < all->used; i++) {
 973         all->data[i] = all->states[i]->trans;
 974         tused[i] = all->states[i]->tused;
 975         all->states[i]->trans = NULL;
 976         all->states[i]->tsize = 0;
 977         all->states[i]->tused = 0;
 978     }
 979     for (int i=0; i < all->used; i++) {
 980         struct state *s = all->states[i];
 981         struct trans *t = all->data[i];
 982         s->accept = 0;
 983         for (int j=0; j < tused[i]; j++) {
 984             r = add_new_trans(t[j].to, s, t[j].min, t[j].max);
 985             if (r < 0)
 986                 goto error;
 987         }
 988         free(t);
 989     }
 990     free(tused);
 991
 992     /* Make new initial and final states */
 993     struct state *s = add_state(fa, 0);
 994     E(s == NULL);
 995
 996     fa->initial->accept = 1;
 997     set_initial(fa, s);
 998     for (int i=0; i < accept->used; i++) {
 999         r = add_epsilon_trans(s, accept->states[i]);
1000         if (r < 0)
1001             goto error;
1002     }
1003
1004     fa->deterministic = 0;
1005     fa->minimal = 0;
1006     state_set_free(all);
1007     return accept;
1008  error:
1009     state_set_free(all);
1010     state_set_free(accept);
1011     return NULL;
1012 }
1013
1014 /*
1015  * Return a sorted array of all interval start points in FA. The returned
1016  * array is a string (null terminated)
1017  */
1018 static uchar* start_points(struct fa *fa, int *npoints) {
1019     char pointset[UCHAR_NUM];
1020     uchar *points = NULL;
1021
1022     F(mark_reachable(fa));
1023     MEMZERO(pointset, UCHAR_NUM);
1024     list_for_each(s, fa->initial) {
1025         if (! s->reachable)
1026             continue;
1027         pointset[0] = 1;
1028         for_each_trans(t, s) {
1029             pointset[t->min] = 1;
1030             if (t->max < UCHAR_MAX)
1031                 pointset[t->max+1] = 1;
1032         }
1033     }
1034
1035     *npoints = 0;
1036     for(int i=0; i < UCHAR_NUM; *npoints += pointset[i], i++);
1037
1038     F(ALLOC_N(points, *npoints+1));
1039     for (int i=0, n=0; i < UCHAR_NUM; i++) {
1040         if (pointset[i])
1041             points[n++] = (uchar) i;
1042     }
1043
1044     return points;
1045  error:
1046     free(points);
1047     return NULL;
1048 }
1049
1050 /*
1051  * Operations on STATE_SET_HASH
1052  */
1053 static int state_set_hash_contains(state_set_hash *smap,
1054                                struct state_set *set) {
1055     return hash_lookup(smap, set) != NULL;
1056 }
1057
1058 /*
1059  * Find the set in SMAP that has the same states as SET. If the two are
1060  * different, i.e. they point to different memory locations, free SET and
1061  * return the set found in SMAP
1062  */
1063 static struct state_set *state_set_hash_uniq(state_set_hash *smap,
1064                                              struct state_set *set) {
1065     hnode_t *node = hash_lookup(smap, set);
1066     const struct state_set *orig_set = hnode_getkey(node);
1067     if (orig_set != set) {
1068         state_set_free(set);
1069     }
1070     return (struct state_set *) orig_set;
1071 }
1072
1073 static struct state *state_set_hash_get_state(state_set_hash *smap,
1074                                              struct state_set *set) {
1075     hnode_t *node = hash_lookup(smap, set);
1076     return (struct state *) hnode_get(node);
1077 }
1078
1079 static hash_val_t set_hash(const void *key) {
1080     hash_val_t hash = 0;
1081     const struct state_set *set = key;
1082
1083     for (int i = 0; i < set->used; i++) {
1084         hash += set->states[i]->hash;
1085     }
1086     return hash;
1087 }
1088
1089 static int set_cmp(const void *key1, const void *key2) {
1090     const struct state_set *set1 = key1;
1091     const struct state_set *set2 = key2;
1092
1093     return state_set_equal(set1, set2) ? 0 : 1;
1094 }
1095
1096 static void set_destroy(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
1097     struct state_set *set = (struct state_set *) hnode_getkey(node);
1098     state_set_free(set);
1099     free(node);
1100 }
1101
1102 ATTRIBUTE_RETURN_CHECK
1103 static int state_set_hash_add(state_set_hash **smap,
1104                               struct state_set *set, struct fa *fa) {
1105     if (*smap == NULL) {
1106         *smap = hash_create(HASHCOUNT_T_MAX, set_cmp, set_hash);
1107         E(*smap == NULL);
1108         hash_set_allocator(*smap, NULL, set_destroy, NULL);
1109     }
1110     struct state *s = add_state(fa, 0);
1111     E(s == NULL);
1112     F(hash_alloc_insert(*smap, set, s));
1113     return 0;
1114  error:
1115     return -1;
1116 }
1117
1118 static void state_set_hash_free(state_set_hash *smap,
1119                                 struct state_set *protect) {
1120     if (protect != NULL) {
1121         hnode_t *node = hash_lookup(smap, protect);
1122         hash_delete(smap, node);
1123         hnode_getkey(node) = NULL;
1124         set_destroy(node, NULL);
1125     }
1126     hash_free_nodes(smap);
1127     hash_destroy(smap);
1128 }
1129
1130 static int state_set_list_add(struct state_set_list **list,
1131                               struct state_set *set) {
1132     struct state_set_list *elt;
1133     if (ALLOC(elt) < 0)
1134         return -1;
1135     elt->set = set;
1136     list_cons(*list, elt);
1137     return 0;
1138 }
1139
1140 static struct state_set *state_set_list_pop(struct state_set_list **list) {
1141     struct state_set_list *elt = *list;
1142     struct state_set      *set = elt->set;
1143
1144     *list = elt->next;
1145     free(elt);
1146     return set;
1147 }
1148
1149 /* Compare transitions lexicographically by (to, min, reverse max) */
1150 static int trans_to_cmp(const void *v1, const void *v2) {
1151     const struct trans *t1 = v1;
1152     const struct trans *t2 = v2;
1153
1154     if (t1->to != t2->to) {
1155         return (t1->to < t2->to) ? -1 : 1;
1156     }
1157     if (t1->min < t2->min)
1158         return -1;
1159     if (t1->min > t2->min)
1160         return 1;
1161     if (t1->max > t2->max)
1162         return -1;
1163     return (t1->max < t2->max) ? 1 : 0;
1164 }
1165
1166 /* Compare transitions lexicographically by (min, reverse max, to) */
1167 static int trans_intv_cmp(const void *v1, const void *v2) {
1168     const struct trans *t1 = v1;
1169     const struct trans *t2 = v2;
1170
1171     if (t1->min < t2->min)
1172         return -1;
1173     if (t1->min > t2->min)
1174         return 1;
1175     if (t1->max > t2->max)
1176         return -1;
1177     if (t1->max < t2->max)
1178         return 1;
1179     if (t1->to != t2->to) {
1180         return (t1->to < t2->to) ? -1 : 1;
1181     }
1182     return 0;
1183 }
1184
1185 /*
1186  * Reduce an automaton by combining overlapping and adjacent edge intervals
1187  * with the same destination.
1188  */
1189 static void reduce(struct fa *fa) {
1190     list_for_each(s, fa->initial) {
1191         if (s->tused == 0)
1192             continue;
1193
1194         qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
1195         int i=0, j=1;
1196         struct trans *t = s->trans;
1197         while (j < s->tused) {
1198             if (t[i].to == t[j].to && t[j].min <= t[i].max + 1) {
1199                 if (t[j].max > t[i].max)
1200                     t[i].max = t[j].max;
1201                 j += 1;
1202             } else {
1203                 i += 1;
1204                 if (i < j)
1205                     memmove(s->trans + i, s->trans + j,
1206                             sizeof(*s->trans) * (s->tused - j));
1207                 s->tused -= j - i;
1208                 j = i + 1;
1209             }
1210         }
1211         s->tused = i+1;
1212         /* Shrink if we use less than half the allocated size */
1213         if (s->tsize > array_initial_size && 2*s->tused < s->tsize) {
1214             int r;
1215             r = REALLOC_N(s->trans, s->tsize);
1216             if (r == 0)
1217                 s->tsize = s->tused;
1218         }
1219     }
1220 }
1221
1222 /*
1223  * Remove dead transitions from an FA; a transition is dead is it does not
1224  * lead to a live state. This also removes any states that are not
1225  * reachable any longer from FA->INITIAL.
1226  *
1227  * Returns the same FA as a convenience
1228  */
1229
1230 static void collect_trans(struct fa *fa) {
1231     list_for_each(s, fa->initial) {
1232         if (! s->live) {
1233             free_trans(s);
1234         } else {
1235             int i=0;
1236             while (i < s->tused) {
1237                 if (! s->trans[i].to->live) {
1238                     s->tused -= 1;
1239                     memmove(s->trans + i, s->trans + s->tused,
1240                             sizeof(*s->trans));
1241                 } else {
1242                     i += 1;
1243                 }
1244             }
1245         }
1246     }
1247 }
1248
1249 static void collect_dead_states(struct fa *fa) {
1250     /* Remove all dead states and free their storage */
1251     for (struct state *s = fa->initial; s->next != NULL; ) {
1252         if (! s->next->live) {
1253             struct state *del = s->next;
1254             s->next = del->next;
1255             free_trans(del);
1256             free(del);
1257         } else {
1258             s = s->next;
1259         }
1260     }
1261 }
1262
1263 static int collect(struct fa *fa) {
1264     F(mark_live(fa));
1265
1266     if (! fa->initial->live) {
1267         /* This automaton accepts nothing, make it the canonical
1268          * epsilon automaton
1269          */
1270         list_for_each(s, fa->initial) {
1271             free_trans(s);
1272         }
1273         list_free(fa->initial->next);
1274         fa->deterministic = 1;
1275     } else {
1276         collect_trans(fa);
1277         collect_dead_states(fa);
1278     }
1279     reduce(fa);
1280     return 0;
1281  error:
1282     return -1;
1283 }
1284
1285 static void swap_initial(struct fa *fa) {
1286     struct state *s = fa->initial;
1287     if (s->next != NULL) {
1288         fa->initial = s->next;
1289         s->next = fa->initial->next;
1290         fa->initial->next = s;
1291     }
1292 }
1293
1294 /*
1295  * Make a finite automaton deterministic using the given set of initial
1296  * states with the subset construction. This also eliminates dead states
1297  * and transitions and reduces and orders the transitions for each state
1298  */
1299 static int determinize(struct fa *fa, struct state_set *ini) {
1300     int npoints;
1301     int make_ini = (ini == NULL);
1302     const uchar *points = NULL;
1303     state_set_hash *newstate = NULL;
1304     struct state_set_list *worklist = NULL;
1305     int ret = 0;
1306
1307     if (fa->deterministic)
1308         return 0;
1309
1310     points = start_points(fa, &npoints);
1311     E(points == NULL);
1312     if (make_ini) {
1313         ini = state_set_init(-1, S_NONE);
1314         if (ini == NULL || state_set_push(ini, fa->initial) < 0)
1315             goto error;
1316     }
1317
1318     F(state_set_list_add(&worklist, ini));
1319     F(state_set_hash_add(&newstate, ini, fa));
1320     // Make the new state the initial state
1321     swap_initial(fa);
1322     while (worklist != NULL) {
1323         struct state_set *sset = state_set_list_pop(&worklist);
1324         struct state *r = state_set_hash_get_state(newstate, sset);
1325         for (int q=0; q < sset->used; q++) {
1326             r->accept |= sset->states[q]->accept;
1327         }
1328         for (int n=0; n < npoints; n++) {
1329             struct state_set *pset = state_set_init(-1, S_SORTED);
1330             E(pset == NULL);
1331             for(int q=0 ; q < sset->used; q++) {
1332                 for_each_trans(t, sset->states[q]) {
1333                     if (t->min <= points[n] && points[n] <= t->max) {
1334                         F(state_set_add(pset, t->to));
1335                     }
1336                 }
1337             }
1338             if (!state_set_hash_contains(newstate, pset)) {
1339                 F(state_set_list_add(&worklist, pset));
1340                 F(state_set_hash_add(&newstate, pset, fa));
1341             }
1342             pset = state_set_hash_uniq(newstate, pset);
1343
1344             struct state *q = state_set_hash_get_state(newstate, pset);
1345             uchar min = points[n];
1346             uchar max = UCHAR_MAX;
1347             if (n+1 < npoints)
1348                 max = points[n+1] - 1;
1349             if (add_new_trans(r, q, min, max) < 0)
1350                 goto error;
1351         }
1352     }
1353     fa->deterministic = 1;
1354
1355  done:
1356     if (newstate)
1357         state_set_hash_free(newstate, make_ini ? NULL : ini);
1358     free((void *) points);
1359     if (collect(fa) < 0)
1360         ret = -1;
1361     return ret;
1362  error:
1363     ret = -1;
1364     goto done;
1365 }
1366
1367 /*
1368  * Minimization. As a sideeffect of minimization, the transitions are
1369  * reduced and ordered.
1370  */
1371
1372 static struct state *step(struct state *s, uchar c) {
1373     for_each_trans(t, s) {
1374         if (t->min <= c && c <= t->max)
1375             return t->to;
1376     }
1377     return NULL;
1378 }
1379
1380 struct state_list {
1381     struct state_list_node *first;
1382     struct state_list_node *last;
1383     unsigned int            size;
1384 };
1385
1386 struct state_list_node {
1387     struct state_list      *sl;
1388     struct state_list_node *next;
1389     struct state_list_node *prev;
1390     struct state           *state;
1391 };
1392
1393 static struct state_list_node *state_list_add(struct state_list *sl,
1394                                               struct state *s) {
1395     struct state_list_node *n;
1396
1397     if (ALLOC(n) < 0)
1398         return NULL;
1399
1400     n->state = s;
1401     n->sl = sl;
1402
1403     if (sl->size++ == 0) {
1404         sl->first = n;
1405         sl->last = n;
1406     } else {
1407         sl->last->next = n;
1408         n->prev = sl->last;
1409         sl->last = n;
1410     }
1411     return n;
1412 }
1413
1414 static void state_list_remove(struct state_list_node *n) {
1415     struct state_list *sl = n->sl;
1416     sl->size -= 1;
1417     if (sl->first == n)
1418         sl->first = n->next;
1419     else
1420         n->prev->next = n->next;
1421     if (sl->last == n)
1422         sl->last = n->prev;
1423     else
1424         n->next->prev = n->prev;
1425
1426     free(n);
1427 }
1428
1429 static void state_list_free(struct state_list *sl) {
1430     if (sl)
1431         list_free(sl->first);
1432     free(sl);
1433 }
1434
1435 /* The linear index of element (q,c) in an NSTATES * NSIGMA matrix */
1436 #define INDEX(q, c) (q * nsigma + c)
1437
1438 static int minimize_hopcroft(struct fa *fa) {
1439     struct state_set *states = NULL;
1440     uchar *sigma = NULL;
1441     struct state_set **reverse = NULL;
1442     bitset *reverse_nonempty = NULL;
1443     struct state_set **partition = NULL;
1444     unsigned int *block = NULL;
1445     struct state_list **active = NULL;
1446     struct state_list_node **active2 = NULL;
1447     int *pending = NULL;
1448     bitset *pending2 = NULL;
1449     struct state_set *split = NULL;
1450     bitset *split2 = NULL;
1451     int *refine = NULL;
1452     bitset *refine2 = NULL;
1453     struct state_set **splitblock = NULL;
1454     struct state_set *newstates = NULL;
1455     int *nsnum = NULL;
1456     int *nsind = NULL;
1457     int result = -1;
1458     unsigned int nstates = 0;
1459     int nsigma = 0;
1460
1461     F(determinize(fa, NULL));
1462
1463     /* Total automaton, nothing to do */
1464     if (fa->initial->tused == 1
1465         && fa->initial->trans[0].to == fa->initial
1466         && fa->initial->trans[0].min == UCHAR_MIN
1467         && fa->initial->trans[0].max == UCHAR_MAX)
1468         return 0;
1469
1470     F(totalize(fa));
1471
1472     /* make arrays for numbered states and effective alphabet */
1473     states = state_set_init(-1, S_NONE);
1474     E(states == NULL);
1475
1476     list_for_each(s, fa->initial) {
1477         F(state_set_push(states, s));
1478     }
1479     nstates = states->used;
1480
1481     sigma = start_points(fa, &nsigma);
1482     E(sigma == NULL);
1483
1484     /* initialize data structures */
1485
1486     /* An ss->used x nsigma matrix of lists of states */
1487     F(ALLOC_N(reverse, nstates * nsigma));
1488     reverse_nonempty = bitset_init(nstates * nsigma);
1489     E(reverse_nonempty == NULL);
1490     F(ALLOC_N(partition, nstates));
1491     F(ALLOC_N(block, nstates));
1492
1493     F(ALLOC_N(active, nstates * nsigma));
1494     F(ALLOC_N(active2, nstates * nsigma));
1495
1496     /* PENDING is an array of pairs of ints. The i'th pair is stored in
1497      * PENDING[2*i] and PENDING[2*i + 1]. There are NPENDING pairs in
1498      * PENDING at any time. SPENDING is the maximum number of pairs
1499      * allocated for PENDING.
1500      */
1501     size_t npending = 0, spending = 0;
1502     pending2 = bitset_init(nstates * nsigma);
1503     E(pending2 == NULL);
1504
1505     split = state_set_init(-1, S_NONE);
1506     split2 = bitset_init(nstates);
1507     E(split == NULL || split2 == NULL);
1508
1509     F(ALLOC_N(refine, nstates));
1510     refine2 = bitset_init(nstates);
1511     E(refine2 == NULL);
1512
1513     F(ALLOC_N(splitblock, nstates));
1514
1515     for (int q = 0; q < nstates; q++) {
1516         splitblock[q] = state_set_init(-1, S_NONE);
1517         partition[q] = state_set_init(-1, S_NONE);
1518         E(splitblock[q] == NULL || partition[q] == NULL);
1519         for (int x = 0; x < nsigma; x++) {
1520             reverse[INDEX(q, x)] = state_set_init(-1, S_NONE);
1521             E(reverse[INDEX(q, x)] == NULL);
1522             F(ALLOC_N(active[INDEX(q, x)], 1));
1523         }
1524     }
1525
1526     /* find initial partition and reverse edges */
1527     for (int q = 0; q < nstates; q++) {
1528         struct state *qq = states->states[q];
1529         int j;
1530         if (qq->accept)
1531             j = 0;
1532         else
1533             j = 1;
1534         F(state_set_push(partition[j], qq));
1535         block[q] = j;
1536         for (int x = 0; x < nsigma; x++) {
1537             uchar y = sigma[x];
1538             struct state *p = step(qq, y);
1539             assert(p != NULL);
1540             int pn = state_set_index(states, p);
1541             assert(pn >= 0);
1542             F(state_set_push(reverse[INDEX(pn, x)], qq));
1543             bitset_set(reverse_nonempty, INDEX(pn, x));
1544         }
1545     }
1546
1547     /* initialize active sets */
1548     for (int j = 0; j <= 1; j++)
1549         for (int x = 0; x < nsigma; x++)
1550             for (int q = 0; q < partition[j]->used; q++) {
1551                 struct state *qq = partition[j]->states[q];
1552                 int qn = state_set_index(states, qq);
1553                 if (bitset_get(reverse_nonempty, INDEX(qn, x))) {
1554                     active2[INDEX(qn, x)] =
1555                         state_list_add(active[INDEX(j, x)], qq);
1556                     E(active2[INDEX(qn, x)] == NULL);
1557                 }
1558             }
1559
1560     /* initialize pending */
1561     F(ALLOC_N(pending, 2*nsigma));
1562     npending = nsigma;
1563     spending = nsigma;
1564     for (int x = 0; x < nsigma; x++) {
1565         int a0 = active[INDEX(0,x)]->size;
1566         int a1 = active[INDEX(1,x)]->size;
1567         int j;
1568         if (a0 <= a1)
1569             j = 0;
1570         else
1571             j = 1;
1572         pending[2*x] = j;
1573         pending[2*x+1] = x;
1574         bitset_set(pending2, INDEX(j, x));
1575     }
1576
1577     /* process pending until fixed point */
1578     int k = 2;
1579     while (npending-- > 0) {
1580         int p = pending[2*npending];
1581         int x = pending[2*npending+1];
1582         bitset_clr(pending2, INDEX(p, x));
1583         int ref = 0;
1584         /* find states that need to be split off their blocks */
1585         struct state_list *sh = active[INDEX(p,x)];
1586         for (struct state_list_node *m = sh->first; m != NULL; m = m->next) {
1587             int q = state_set_index(states, m->state);
1588             struct state_set *rev = reverse[INDEX(q, x)];
1589             for (int r =0; r < rev->used; r++) {
1590                 struct state *rs = rev->states[r];
1591                 int s = state_set_index(states, rs);
1592                 if (! bitset_get(split2, s)) {
1593                     bitset_set(split2, s);
1594                     F(state_set_push(split, rs));
1595                     int j = block[s];
1596                     F(state_set_push(splitblock[j], rs));
1597                     if (!bitset_get(refine2, j)) {
1598                         bitset_set(refine2, j);
1599                         refine[ref++] = j;
1600                     }
1601                 }
1602             }
1603         }
1604         // refine blocks
1605         for(int rr=0; rr < ref; rr++) {
1606             int j = refine[rr];
1607             struct state_set *sp = splitblock[j];
1608             if (sp->used < partition[j]->used) {
1609                 struct state_set *b1 = partition[j];
1610                 struct state_set *b2 = partition[k];
1611                 for (int s = 0; s < sp->used; s++) {
1612                     state_set_remove(b1, sp->states[s]);
1613                     F(state_set_push(b2, sp->states[s]));
1614                     int snum = state_set_index(states, sp->states[s]);
1615                     block[snum] = k;
1616                     for (int c = 0; c < nsigma; c++) {
1617                         struct state_list_node *sn = active2[INDEX(snum, c)];
1618                         if (sn != NULL && sn->sl == active[INDEX(j,c)]) {
1619                             state_list_remove(sn);
1620                             active2[INDEX(snum, c)] =
1621                                 state_list_add(active[INDEX(k, c)],
1622                                                sp->states[s]);
1623                             E(active2[INDEX(snum, c)] == NULL);
1624                         }
1625                     }
1626                 }
1627                 // update pending
1628                 for (int c = 0; c < nsigma; c++) {
1629                     int aj = active[INDEX(j, c)]->size;
1630                     int ak = active[INDEX(k, c)]->size;
1631                     if (npending + 1 > spending) {
1632                         spending *= 2;
1633                         F(REALLOC_N(pending, 2 * spending));
1634                     }
1635                     pending[2*npending + 1] = c;
1636                     if (!bitset_get(pending2, INDEX(j, c))
1637                         && 0 < aj && aj <= ak) {
1638                         bitset_set(pending2, INDEX(j, c));
1639                         pending[2*npending] = j;
1640                     } else {
1641                         bitset_set(pending2, INDEX(k, c));
1642                         pending[2*npending] = k;
1643                     }
1644                     npending += 1;
1645                 }
1646                 k++;
1647             }
1648             for (int s = 0; s < sp->used; s++) {
1649                 int snum = state_set_index(states, sp->states[s]);
1650                 bitset_clr(split2, snum);
1651             }
1652             bitset_clr(refine2, j);
1653             sp->used = 0;
1654         }
1655         split->used = 0;
1656     }
1657
1658     /* make a new state for each equivalence class, set initial state */
1659     newstates = state_set_init(k, S_NONE);
1660     E(newstates == NULL);
1661     F(ALLOC_N(nsnum, k));
1662     F(ALLOC_N(nsind, nstates));
1663
1664     for (int n = 0; n < k; n++) {
1665         struct state *s = make_state();
1666         E(s == NULL);
1667         newstates->states[n] = s;
1668         struct state_set *partn = partition[n];
1669         for (int q=0; q < partn->used; q++) {
1670             struct state *qs = partn->states[q];
1671             int qnum = state_set_index(states, qs);
1672             if (qs == fa->initial)
1673                 s->live = 1;     /* Abuse live to flag the new intial state */
1674             nsnum[n] = qnum;     /* select representative */
1675             nsind[qnum] = n;     /* and point from partition to new state */
1676         }
1677     }
1678
1679     /* build transitions and set acceptance */
1680     for (int n = 0; n < k; n++) {
1681         struct state *s = newstates->states[n];
1682         s->accept = states->states[nsnum[n]]->accept;
1683         for_each_trans(t, states->states[nsnum[n]]) {
1684             int toind = state_set_index(states, t->to);
1685             struct state *nto = newstates->states[nsind[toind]];
1686             F(add_new_trans(s, nto, t->min, t->max));
1687         }
1688     }
1689
1690     /* Get rid of old states and transitions and turn NEWTSTATES into
1691        a linked list */
1692     gut(fa);
1693     for (int n=0; n < k; n++)
1694         if (newstates->states[n]->live) {
1695             struct state *ini = newstates->states[n];
1696             newstates->states[n] = newstates->states[0];
1697             newstates->states[0] = ini;
1698         }
1699     for (int n=0; n < k-1; n++)
1700         newstates->states[n]->next = newstates->states[n+1];
1701     fa->initial = newstates->states[0];
1702
1703     result = 0;
1704
1705     /* clean up */
1706  done:
1707     free(nsind);
1708     free(nsnum);
1709     state_set_free(states);
1710     free(sigma);
1711     bitset_free(reverse_nonempty);
1712     free(block);
1713     for (int i=0; i < nstates*nsigma; i++) {
1714         if (reverse)
1715             state_set_free(reverse[i]);
1716         if (active)
1717             state_list_free(active[i]);
1718     }
1719     free(reverse);
1720     free(active);
1721     free(active2);
1722     free(pending);
1723     bitset_free(pending2);
1724     state_set_free(split);
1725     bitset_free(split2);
1726     free(refine);
1727     bitset_free(refine2);
1728     for (int q=0; q < nstates; q++) {
1729         if (splitblock)
1730             state_set_free(splitblock[q]);
1731         if (partition)
1732             state_set_free(partition[q]);
1733     }
1734     free(splitblock);
1735     free(partition);
1736     state_set_free(newstates);
1737
1738     if (collect(fa) < 0)
1739         result = -1;
1740     return result;
1741  error:
1742     result = -1;
1743     goto done;
1744 }
1745
1746 static int minimize_brzozowski(struct fa *fa) {
1747     struct state_set *set;
1748
1749     /* Minimize using Brzozowski's algorithm */
1750     set = fa_reverse(fa);
1751     E(set == NULL);
1752     F(determinize(fa, set));
1753     state_set_free(set);
1754
1755     set = fa_reverse(fa);
1756     E(set == NULL);
1757     F(determinize(fa, set));
1758     state_set_free(set);
1759     return 0;
1760  error:
1761     return -1;
1762 }
1763
1764 int fa_minimize(struct fa *fa) {
1765     int r;
1766
1767     if (fa == NULL)
1768         return -1;
1769     if (fa->minimal)
1770         return 0;
1771
1772     if (fa_minimization_algorithm == FA_MIN_BRZOZOWSKI) {
1773         r = minimize_brzozowski(fa);
1774     } else {
1775         r = minimize_hopcroft(fa);
1776     }
1777
1778     if (r == 0)
1779         fa->minimal = 1;
1780     return r;
1781 }
1782
1783 /*
1784  * Construction of fa
1785  */
1786
1787 static struct fa *fa_make_empty(void) {
1788     struct fa *fa;
1789
1790     if (ALLOC(fa) < 0)
1791         return NULL;
1792     if (add_state(fa, 0) == NULL) {
1793         fa_free(fa);
1794         return NULL;
1795     }
1796     fa->deterministic = 1;
1797     fa->minimal = 1;
1798     return fa;
1799 }
1800
1801 static struct fa *fa_make_epsilon(void) {
1802     struct fa *fa = fa_make_empty();
1803     if (fa) {
1804         fa->initial->accept = 1;
1805         fa->deterministic= 1;
1806         fa->minimal = 1;
1807     }
1808     return fa;
1809 }
1810
1811 static struct fa *fa_make_char(uchar c) {
1812     struct fa *fa = fa_make_empty();
1813     if (! fa)
1814         return NULL;
1815
1816     struct state *s = fa->initial;
1817     struct state *t = add_state(fa, 1);
1818     int r;
1819
1820     if (t == NULL)
1821         goto error;
1822
1823     r = add_new_trans(s, t, c, c);
1824     if (r < 0)
1825         goto error;
1826     fa->deterministic = 1;
1827     fa->minimal = 1;
1828     return fa;
1829  error:
1830     fa_free(fa);
1831     return NULL;
1832 }
1833
1834 struct fa *fa_make_basic(unsigned int basic) {
1835     int r;
1836
1837     if (basic == FA_EMPTY) {
1838         return fa_make_empty();
1839     } else if (basic == FA_EPSILON) {
1840         return fa_make_epsilon();
1841     } else if (basic == FA_TOTAL) {
1842         struct fa *fa = fa_make_epsilon();
1843         r = add_new_trans(fa->initial, fa->initial, UCHAR_MIN, UCHAR_MAX);
1844         if (r < 0) {
1845             fa_free(fa);
1846             fa = NULL;
1847         }
1848         return fa;
1849     }
1850     return NULL;
1851 }
1852
1853 int fa_is_basic(struct fa *fa, unsigned int basic) {
1854     if (basic == FA_EMPTY) {
1855         return ! fa->initial->accept && fa->initial->tused == 0;
1856     } else if (basic == FA_EPSILON) {
1857         return fa->initial->accept && fa->initial->tused == 0;
1858     } else if (basic == FA_TOTAL) {
1859         if (! fa->initial->accept)
1860             return 0;
1861         if (fa->nocase) {
1862             if (fa->initial->tused != 2)
1863                 return 0;
1864             struct trans *t1 = fa->initial->trans;
1865             struct trans *t2 = fa->initial->trans + 1;
1866             if (t1->to != fa->initial || t2->to != fa->initial)
1867                 return 0;
1868             if (t2->max != UCHAR_MAX) {
1869                 t1 = t2;
1870                 t2 = fa->initial->trans;
1871             }
1872             return (t1->min == UCHAR_MIN && t1->max == 'A' - 1 &&
1873                     t2->min == 'Z' + 1 && t2->max == UCHAR_MAX);
1874         } else {
1875             struct trans *t = fa->initial->trans;
1876             return fa->initial->tused == 1 &&
1877                 t->to == fa->initial &&
1878                 t->min == UCHAR_MIN && t->max == UCHAR_MAX;
1879         }
1880     }
1881     return 0;
1882 }
1883
1884 static struct fa *fa_clone(struct fa *fa) {
1885     struct fa *result = NULL;
1886     struct state_set *set = state_set_init(-1, S_DATA|S_SORTED);
1887     int r;
1888
1889     if (fa == NULL || set == NULL || ALLOC(result) < 0)
1890         goto error;
1891
1892     result->deterministic = fa->deterministic;
1893     result->minimal = fa->minimal;
1894     result->nocase = fa->nocase;
1895     list_for_each(s, fa->initial) {
1896         int i = state_set_push(set, s);
1897         E(i < 0);
1898
1899         struct state *q = add_state(result, s->accept);
1900         if (q == NULL)
1901             goto error;
1902         set->data[i] = q;
1903         q->live = s->live;
1904         q->reachable = s->reachable;
1905     }
1906     for (int i=0; i < set->used; i++) {
1907         struct state *s = set->states[i];
1908         struct state *sc = set->data[i];
1909         for_each_trans(t, s) {
1910             int to = state_set_index(set, t->to);
1911             assert(to >= 0);
1912             struct state *toc = set->data[to];
1913             r = add_new_trans(sc, toc, t->min, t->max);
1914             if (r < 0)
1915                 goto error;
1916         }
1917     }
1918     state_set_free(set);
1919     return result;
1920  error:
1921     state_set_free(set);
1922     fa_free(result);
1923     return NULL;
1924 }
1925
1926 static int case_expand(struct fa *fa);
1927
1928 /* Compute FA1|FA2 and set FA1 to that automaton. FA2 is freed */
1929 ATTRIBUTE_RETURN_CHECK
1930 static int union_in_place(struct fa *fa1, struct fa **fa2) {
1931     struct state *s;
1932     int r;
1933
1934     if (fa1->nocase != (*fa2)->nocase) {
1935         if (case_expand(fa1) < 0)
1936             return -1;
1937         if (case_expand(*fa2) < 0)
1938             return -1;
1939     }
1940
1941     s = add_state(fa1, 0);
1942     if (s == NULL)
1943         return -1;
1944     r = add_epsilon_trans(s, fa1->initial);
1945     if (r < 0)
1946         return -1;
1947     r = add_epsilon_trans(s, (*fa2)->initial);
1948     if (r < 0)
1949         return -1;
1950
1951     fa1->deterministic = 0;
1952     fa1->minimal = 0;
1953     fa_merge(fa1, fa2);
1954
1955     set_initial(fa1, s);
1956
1957     return 0;
1958 }
1959
1960 struct fa *fa_union(struct fa *fa1, struct fa *fa2) {
1961     fa1 = fa_clone(fa1);
1962     fa2 = fa_clone(fa2);
1963     if (fa1 == NULL || fa2 == NULL)
1964         goto error;
1965
1966     F(union_in_place(fa1, &fa2));
1967
1968     return fa1;
1969  error:
1970     fa_free(fa1);
1971     fa_free(fa2);
1972     return NULL;
1973 }
1974
1975 /* Concat FA2 onto FA1; frees FA2 and changes FA1 to FA1.FA2 */
1976 ATTRIBUTE_RETURN_CHECK
1977 static int concat_in_place(struct fa *fa1, struct fa **fa2) {
1978     int r;
1979
1980     if (fa1->nocase != (*fa2)->nocase) {
1981         if (case_expand(fa1) < 0)
1982             return -1;
1983         if (case_expand(*fa2) < 0)
1984             return -1;
1985     }
1986
1987     list_for_each(s, fa1->initial) {
1988         if (s->accept) {
1989             s->accept = 0;
1990             r = add_epsilon_trans(s, (*fa2)->initial);
1991             if (r < 0)
1992                 return -1;
1993         }
1994     }
1995
1996     fa1->deterministic = 0;
1997     fa1->minimal = 0;
1998     fa_merge(fa1, fa2);
1999
2000     return 0;
2001 }
2002
2003 struct fa *fa_concat(struct fa *fa1, struct fa *fa2) {
2004     fa1 = fa_clone(fa1);
2005     fa2 = fa_clone(fa2);
2006
2007     if (fa1 == NULL || fa2 == NULL)
2008         goto error;
2009
2010     F(concat_in_place(fa1, &fa2));
2011
2012     F(collect(fa1));
2013
2014     return fa1;
2015
2016  error:
2017     fa_free(fa1);
2018     fa_free(fa2);
2019     return NULL;
2020 }
2021
2022 static struct fa *fa_make_char_set(bitset *cset, int negate) {
2023     struct fa *fa = fa_make_empty();
2024     if (!fa)
2025         return NULL;
2026
2027     struct state *s = fa->initial;
2028     struct state *t = add_state(fa, 1);
2029     int from = 0;
2030     int r;
2031
2032     if (t == NULL)
2033         goto error;
2034
2035     while (from <= UCHAR_MAX) {
2036         while (from <= UCHAR_MAX && bitset_get(cset, from) == negate)
2037             from += 1;
2038         if (from > UCHAR_MAX)
2039             break;
2040         int to = from;
2041         while (to < UCHAR_MAX && (bitset_get(cset, to + 1) == !negate))
2042             to += 1;
2043         r = add_new_trans(s, t, from, to);
2044         if (r < 0)
2045             goto error;
2046         from = to + 1;
2047     }
2048
2049     fa->deterministic = 1;
2050     fa->minimal = 1;
2051     return fa;
2052
2053  error:
2054     fa_free(fa);
2055     return NULL;
2056 }
2057
2058 static struct fa *fa_star(struct fa *fa) {
2059     struct state *s;
2060     int r;
2061
2062     fa = fa_clone(fa);
2063     if (fa == NULL)
2064         return NULL;
2065
2066     s = add_state(fa, 1);
2067     if (s == NULL)
2068         goto error;
2069
2070     r = add_epsilon_trans(s, fa->initial);
2071     if (r < 0)
2072         goto error;
2073
2074     set_initial(fa, s);
2075     list_for_each(p, fa->initial->next) {
2076         if (p->accept) {
2077             r = add_epsilon_trans(p, s);
2078             if (r < 0)
2079                 goto error;
2080         }
2081     }
2082     fa->deterministic = 0;
2083     fa->minimal = 0;
2084
2085     return fa;
2086
2087  error:
2088     fa_free(fa);
2089     return NULL;
2090 }
2091
2092 /* Form the automaton (FA){N}; FA is not modified */
2093 static struct fa *repeat(struct fa *fa, int n) {
2094     if (n == 0) {
2095         return fa_make_epsilon();
2096     } else if (n == 1) {
2097         return fa_clone(fa);
2098     } else {
2099         struct fa *cfa = fa_clone(fa);
2100         if (cfa == NULL)
2101             return NULL;
2102         while (n > 1) {
2103             struct fa *tfa = fa_clone(fa);
2104             if (tfa == NULL) {
2105                 fa_free(cfa);
2106                 return NULL;
2107             }
2108             if (concat_in_place(cfa, &tfa) < 0) {
2109                 fa_free(cfa);
2110                 fa_free(tfa);
2111                 return NULL;
2112             }
2113             n -= 1;
2114         }
2115         return cfa;
2116     }
2117 }
2118
2119 struct fa *fa_iter(struct fa *fa, int min, int max) {
2120     int r;
2121
2122     if (min < 0)
2123         min = 0;
2124
2125     if (min > max && max != -1) {
2126         return fa_make_empty();
2127     }
2128     if (max == -1) {
2129         struct fa *sfa = fa_star(fa);
2130         if (min == 0)
2131             return sfa;
2132         if (! sfa)
2133             return NULL;
2134         struct fa *cfa = repeat(fa, min);
2135         if (! cfa) {
2136             fa_free(sfa);
2137             return NULL;
2138         }
2139         if (concat_in_place(cfa, &sfa) < 0) {
2140             fa_free(sfa);
2141             fa_free(cfa);
2142             return NULL;
2143         }
2144         return cfa;
2145     } else {
2146         struct fa *cfa = NULL;
2147
2148         max -= min;
2149         cfa = repeat(fa, min);
2150         if (cfa == NULL)
2151             return NULL;
2152         if (max > 0) {
2153             struct fa *cfa2 = fa_clone(fa);
2154             if (cfa2 == NULL) {
2155                 fa_free(cfa);
2156                 return NULL;
2157             }
2158             while (max > 1) {
2159                 struct fa *cfa3 = fa_clone(fa);
2160                 if (cfa3 == NULL) {
2161                     fa_free(cfa);
2162                     fa_free(cfa2);
2163                     return NULL;
2164                 }
2165                 list_for_each(s, cfa3->initial) {
2166                     if (s->accept) {
2167                         r = add_epsilon_trans(s, cfa2->initial);
2168                         if (r < 0) {
2169                             fa_free(cfa);
2170                             fa_free(cfa2);
2171                             fa_free(cfa3);
2172                             return NULL;
2173                         }
2174                     }
2175                 }
2176                 fa_merge(cfa3, &cfa2);
2177                 cfa2 = cfa3;
2178                 max -= 1;
2179             }
2180             list_for_each(s, cfa->initial) {
2181                 if (s->accept) {
2182                     r = add_epsilon_trans(s, cfa2->initial);
2183                     if (r < 0) {
2184                         fa_free(cfa);
2185                         fa_free(cfa2);
2186                         return NULL;
2187                     }
2188                 }
2189             }
2190             fa_merge(cfa, &cfa2);
2191             cfa->deterministic = 0;
2192             cfa->minimal = 0;
2193         }
2194         if (collect(cfa) < 0) {
2195             fa_free(cfa);
2196             cfa = NULL;
2197         }
2198         return cfa;
2199     }
2200 }
2201
2202 static void sort_transition_intervals(struct fa *fa) {
2203     list_for_each(s, fa->initial) {
2204         qsort(s->trans, s->tused, sizeof(*s->trans), trans_intv_cmp);
2205     }
2206 }
2207
2208 struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) {
2209     int ret;
2210     struct fa *fa = NULL;
2211     struct state_set *worklist = NULL;
2212     state_triple_hash *newstates = NULL;
2213
2214     if (fa1 == fa2)
2215         return fa_clone(fa1);
2216
2217     if (fa_is_basic(fa1, FA_EMPTY) || fa_is_basic(fa2, FA_EMPTY))
2218         return fa_make_empty();
2219
2220     if (fa1->nocase != fa2->nocase) {
2221         F(case_expand(fa1));
2222         F(case_expand(fa2));
2223     }
2224
2225     fa = fa_make_empty();
2226     worklist = state_set_init(-1, S_NONE);
2227     newstates = state_triple_init();
2228     if (fa == NULL || worklist == NULL || newstates == NULL)
2229         goto error;
2230
2231     sort_transition_intervals(fa1);
2232     sort_transition_intervals(fa2);
2233
2234     F(state_set_push(worklist, fa1->initial));
2235     F(state_set_push(worklist, fa2->initial));
2236     F(state_set_push(worklist, fa->initial));
2237     F(state_triple_push(newstates,
2238                          fa1->initial, fa2->initial, fa->initial));
2239     while (worklist->used) {
2240         struct state *s  = state_set_pop(worklist);
2241         struct state *p2 = state_set_pop(worklist);
2242         struct state *p1 = state_set_pop(worklist);
2243         s->accept = p1->accept && p2->accept;
2244
2245         struct trans *t1 = p1->trans;
2246         struct trans *t2 = p2->trans;
2247         for (int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2248             while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2249                 b2++;
2250             for (int n2 = b2;
2251                  n2 < p2->tused && t1[n1].max >= t2[n2].min;
2252                  n2++) {
2253                 if (t2[n2].max >= t1[n1].min) {
2254                     struct state *r = state_triple_thd(newstates,
2255                                                        t1[n1].to, t2[n2].to);
2256                     if (r == NULL) {
2257                         r = add_state(fa, 0);
2258                         E(r == NULL);
2259                         F(state_set_push(worklist, t1[n1].to));
2260                         F(state_set_push(worklist, t2[n2].to));
2261                         F(state_set_push(worklist, r));
2262                         F(state_triple_push(newstates,
2263                                              t1[n1].to, t2[n2].to, r));
2264                     }
2265                     char min = t1[n1].min > t2[n2].min
2266                         ? t1[n1].min : t2[n2].min;
2267                     char max = t1[n1].max < t2[n2].max
2268                         ? t1[n1].max : t2[n2].max;
2269                     ret = add_new_trans(s, r, min, max);
2270                     if (ret < 0)
2271                         goto error;
2272                 }
2273             }
2274         }
2275     }
2276     fa->deterministic = fa1->deterministic && fa2->deterministic;
2277     fa->nocase = fa1->nocase && fa2->nocase;
2278  done:
2279     state_set_free(worklist);
2280     state_triple_free(newstates);
2281     if (fa != NULL) {
2282         if (collect(fa) < 0) {
2283             fa_free(fa);
2284             fa = NULL;
2285         }
2286     }
2287
2288     return fa;
2289  error:
2290     fa_free(fa);
2291     fa = NULL;
2292     goto done;
2293 }
2294
2295 int fa_contains(struct fa *fa1, struct fa *fa2) {
2296     int result = 0;
2297     struct state_set *worklist = NULL;  /* List of pairs of states */
2298     struct state_set *visited = NULL;   /* List of pairs of states */
2299
2300     if (fa1 == NULL || fa2 == NULL)
2301         return -1;
2302
2303     if (fa1 == fa2)
2304         return 1;
2305
2306     F(determinize(fa2, NULL));
2307     sort_transition_intervals(fa1);
2308     sort_transition_intervals(fa2);
2309
2310     F(state_pair_push(&worklist, fa1->initial, fa2->initial));
2311     F(state_pair_push(&visited, fa1->initial, fa2->initial));
2312     while (worklist->used) {
2313         struct state *p1, *p2;
2314         void *v2;
2315         p1 = state_set_pop_data(worklist, &v2);
2316         p2 = v2;
2317
2318         if (p1->accept && !p2->accept)
2319             goto done;
2320
2321         struct trans *t1 = p1->trans;
2322         struct trans *t2 = p2->trans;
2323         for(int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2324             while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2325                 b2++;
2326             int min1 = t1[n1].min, max1 = t1[n1].max;
2327             for (int n2 = b2;
2328                  n2 < p2->tused && t1[n1].max >= t2[n2].min;
2329                  n2++) {
2330                 if (t2[n2].min > min1)
2331                     goto done;
2332                 if (t2[n2].max < CHAR_MAX)
2333                     min1 = t2[n2].max + 1;
2334                 else {
2335                     min1 = UCHAR_MAX;
2336                     max1 = UCHAR_MIN;
2337                 }
2338                 if (state_pair_find(visited, t1[n1].to, t2[n2].to) == -1) {
2339                     F(state_pair_push(&worklist, t1[n1].to, t2[n2].to));
2340                     F(state_pair_push(&visited, t1[n1].to, t2[n2].to));
2341                 }
2342             }
2343             if (min1 <= max1)
2344                 goto done;
2345         }
2346     }
2347
2348     result = 1;
2349  done:
2350     state_set_free(worklist);
2351     state_set_free(visited);
2352     return result;
2353  error:
2354     result = -1;
2355     goto done;
2356 }
2357
2358 static int add_crash_trans(struct fa *fa, struct state *s, struct state *crash,
2359                            int min, int max) {
2360     int result;
2361
2362     if (fa->nocase) {
2363         /* Never transition on anything in [A-Z] */
2364         if (min > 'Z' || max < 'A') {
2365             result = add_new_trans(s, crash, min, max);
2366         } else if (min >= 'A' && max <= 'Z') {
2367             result = 0;
2368         } else if (max <= 'Z') {
2369             /* min < 'A' */
2370             result = add_new_trans(s, crash, min, 'A' - 1);
2371         } else if (min >= 'A') {
2372             /* max > 'Z' */
2373             result = add_new_trans(s, crash, 'Z' + 1, max);
2374         } else {
2375             /* min < 'A' && max > 'Z' */
2376             result = add_new_trans(s, crash, min, 'A' - 1);
2377             if (result == 0)
2378                 result = add_new_trans(s, crash, 'Z' + 1, max);
2379         }
2380     } else {
2381         result = add_new_trans(s, crash, min, max);
2382     }
2383     return result;
2384 }
2385
2386 static int totalize(struct fa *fa) {
2387     int r;
2388     struct state *crash = add_state(fa, 0);
2389
2390     E(crash == NULL);
2391     F(mark_reachable(fa));
2392     sort_transition_intervals(fa);
2393
2394     r = add_crash_trans(fa, crash, crash, UCHAR_MIN, UCHAR_MAX);
2395     if (r < 0)
2396         return -1;
2397
2398     list_for_each(s, fa->initial) {
2399         int next = UCHAR_MIN;
2400         int tused = s->tused;
2401         for (int i=0; i < tused; i++) {
2402             uchar min = s->trans[i].min, max = s->trans[i].max;
2403             if (min > next) {
2404                 r = add_crash_trans(fa, s, crash, next, min - 1);
2405                 if (r < 0)
2406                     return -1;
2407             }
2408             if (max + 1 > next)
2409                 next = max + 1;
2410         }
2411         if (next <= UCHAR_MAX) {
2412             r = add_crash_trans(fa, s, crash, next, UCHAR_MAX);
2413             if (r < 0)
2414                 return -1;
2415         }
2416     }
2417     return 0;
2418  error:
2419     return -1;
2420 }
2421
2422 struct fa *fa_complement(struct fa *fa) {
2423     fa = fa_clone(fa);
2424     E(fa == NULL);
2425     F(determinize(fa, NULL));
2426     F(totalize(fa));
2427     list_for_each(s, fa->initial)
2428         s->accept = ! s->accept;
2429
2430     F(collect(fa));
2431     return fa;
2432  error:
2433     fa_free(fa);
2434     return NULL;
2435 }
2436
2437 struct fa *fa_minus(struct fa *fa1, struct fa *fa2) {
2438     if (fa1 == NULL || fa2 == NULL)
2439         return NULL;
2440
2441     if (fa_is_basic(fa1, FA_EMPTY) || fa1 == fa2)
2442         return fa_make_empty();
2443     if (fa_is_basic(fa2, FA_EMPTY))
2444         return fa_clone(fa1);
2445
2446     struct fa *cfa2 = fa_complement(fa2);
2447     if (cfa2 == NULL)
2448         return NULL;
2449
2450     struct fa *result = fa_intersect(fa1, cfa2);
2451     fa_free(cfa2);
2452     return result;
2453 }
2454
2455 static int accept_to_accept(struct fa *fa) {
2456     int r;
2457     struct state *s = add_state(fa, 0);
2458     if (s == NULL)
2459         return -1;
2460
2461     F(mark_reachable(fa));
2462     list_for_each(a, fa->initial) {
2463         if (a->accept && a->reachable) {
2464             r = add_epsilon_trans(s, a);
2465             if (r < 0)
2466                 return -1;
2467         }
2468     }
2469
2470     set_initial(fa, s);
2471     fa->deterministic = fa->minimal = 0;
2472     return 0;
2473  error:
2474     return -1;
2475 }
2476
2477 struct fa *fa_overlap(struct fa *fa1, struct fa *fa2) {
2478     struct fa *fa = NULL, *eps = NULL, *result = NULL;
2479     struct state_set *map = NULL;
2480
2481     if (fa1 == NULL || fa2 == NULL)
2482         return NULL;
2483
2484     fa1 = fa_clone(fa1);
2485     fa2 = fa_clone(fa2);
2486     if (fa1 == NULL || fa2 == NULL)
2487         goto error;
2488
2489     if (determinize(fa1, NULL) < 0)
2490         goto error;
2491     if (accept_to_accept(fa1) < 0)
2492         goto error;
2493
2494     map = fa_reverse(fa2);
2495     state_set_free(map);
2496     if (determinize(fa2, NULL) < 0)
2497         goto error;
2498     if (accept_to_accept(fa2) < 0)
2499         goto error;
2500     map = fa_reverse(fa2);
2501     state_set_free(map);
2502     if (determinize(fa2, NULL) < 0)
2503         goto error;
2504
2505     fa = fa_intersect(fa1, fa2);
2506     if (fa == NULL)
2507         goto error;
2508
2509     eps = fa_make_epsilon();
2510     if (eps == NULL)
2511         goto error;
2512
2513     result = fa_minus(fa, eps);
2514     if (result == NULL)
2515         goto error;
2516
2517  error:
2518     fa_free(fa1);
2519     fa_free(fa2);
2520     fa_free(fa);
2521     fa_free(eps);
2522     return result;
2523 }
2524
2525 int fa_equals(struct fa *fa1, struct fa *fa2) {
2526     if (fa1 == NULL || fa2 == NULL)
2527         return -1;
2528
2529     /* fa_contains(fa1, fa2) && fa_contains(fa2, fa1) with error checking */
2530     int c1 = fa_contains(fa1, fa2);
2531     if (c1 < 0)
2532         return -1;
2533     if (c1 == 0)
2534         return 0;
2535     return fa_contains(fa2, fa1);
2536 }
2537
2538 static unsigned int chr_score(char c) {
2539     if (isalpha(c)) {
2540         return 2;
2541     } else if (isalnum(c)) {
2542         return 3;
2543     } else if (isprint(c)) {
2544         return 7;
2545     } else if (c == '\0') {
2546         return 10000;
2547     } else {
2548         return 100;
2549     }
2550 }
2551
2552 static unsigned int str_score(const struct re_str *str) {
2553     unsigned int score = 0;
2554     for (int i=0; i < str->len; i++) {
2555         score += chr_score(str->rx[i]);
2556     }
2557     return score;
2558 }
2559
2560 /* See if we get a better string for DST by appending C to SRC. If DST is
2561  * NULL or empty, always use SRC + C
2562  */
2563 static struct re_str *string_extend(struct re_str *dst,
2564                                     const struct re_str *src,
2565                                     char c) {
2566     if (dst == NULL
2567         || dst->len == 0
2568         || str_score(src) + chr_score(c) < str_score(dst)) {
2569         int slen = src->len;
2570         if (dst == NULL)
2571             dst = make_re_str(NULL);
2572         if (dst == NULL)
2573             return NULL;
2574         if (REALLOC_N(dst->rx, slen+2) < 0) {
2575             free(dst);
2576             return NULL;
2577         }
2578         memcpy(dst->rx, src->rx, slen);
2579         dst->rx[slen] = c;
2580         dst->rx[slen + 1] = '\0';
2581         dst->len = slen + 1;
2582     }
2583     return dst;
2584 }
2585
2586 static char pick_char(struct trans *t) {
2587     for (int c = t->min; c <= t->max; c++)
2588         if (isalpha(c)) return c;
2589     for (int c = t->min; c <= t->max; c++)
2590         if (isalnum(c)) return c;
2591     for (int c = t->min; c <= t->max; c++)
2592         if (isprint(c)) return c;
2593     return t->max;
2594 }
2595
2596 /* Generate an example string for FA. Traverse all transitions and record
2597  * at each turn the "best" word found for that state.
2598  */
2599 int fa_example(struct fa *fa, char **example, size_t *example_len) {
2600     struct re_str *word = NULL;
2601     struct state_set *path = NULL, *worklist = NULL;
2602     struct re_str *str = NULL;
2603
2604     *example = NULL;
2605     *example_len = 0;
2606
2607     /* Sort to avoid any ambiguity because of reordering of transitions */
2608     sort_transition_intervals(fa);
2609
2610     /* Map from state to string */
2611     path = state_set_init(-1, S_DATA|S_SORTED);
2612     str = make_re_str("");
2613     if (path == NULL || str == NULL)
2614         goto error;
2615     F(state_set_push_data(path, fa->initial, str));
2616     str = NULL;
2617
2618     /* List of states still to visit */
2619     worklist = state_set_init(-1, S_NONE);
2620     if (worklist == NULL)
2621         goto error;
2622     F(state_set_push(worklist, fa->initial));
2623
2624     while (worklist->used) {
2625         struct state *s = state_set_pop(worklist);
2626         struct re_str *ps = state_set_find_data(path, s);
2627         for_each_trans(t, s) {
2628             char c = pick_char(t);
2629             int toind = state_set_index(path, t->to);
2630             if (toind == -1) {
2631                 struct re_str *w = string_extend(NULL, ps, c);
2632                 E(w == NULL);
2633                 F(state_set_push(worklist, t->to));
2634                 F(state_set_push_data(path, t->to, w));
2635             } else {
2636                 path->data[toind] = string_extend(path->data[toind], ps, c);
2637             }
2638         }
2639     }
2640
2641     for (int i=0; i < path->used; i++) {
2642         struct state *p = path->states[i];
2643         struct re_str *ps = path->data[i];
2644         if (p->accept &&
2645             (word == NULL || word->len == 0
2646              || (ps->len > 0 && str_score(word) > str_score(ps)))) {
2647             free_re_str(word);
2648             word = ps;
2649         } else {
2650             free_re_str(ps);
2651         }
2652     }
2653     state_set_free(path);
2654     state_set_free(worklist);
2655     if (word != NULL) {
2656         *example_len = word->len;
2657         *example = word->rx;
2658         free(word);
2659     }
2660     return 0;
2661  error:
2662     state_set_free(path);
2663     state_set_free(worklist);
2664     free_re_str(word);
2665     free_re_str(str);
2666     return -1;
2667 }
2668
2669 struct enum_intl {
2670     int       limit;
2671     int       nwords;
2672     char    **words;
2673     char     *buf;
2674     size_t    bsize;
2675 };
2676
2677 static int fa_enumerate_intl(struct state *s, struct enum_intl *ei, int pos) {
2678     int result = -1;
2679
2680     if (ei->bsize <= pos + 1) {
2681         ei->bsize *= 2;
2682         F(REALLOC_N(ei->buf, ei->bsize));
2683     }
2684
2685     ei->buf[pos] = '\0';
2686     for_each_trans(t, s) {
2687         if (t->to->visited)
2688             return -2;
2689         t->to->visited = 1;
2690         for (int i=t->min; i <= t->max; i++) {
2691             ei->buf[pos] = i;
2692             if (t->to->accept) {
2693                 if (ei->nwords >= ei->limit)
2694                     return -2;
2695                 ei->words[ei->nwords] = strdup(ei->buf);
2696                 E(ei->words[ei->nwords] == NULL);
2697                 ei->nwords += 1;
2698             }
2699             result = fa_enumerate_intl(t->to, ei, pos+1);
2700             E(result < 0);
2701         }
2702         t->to->visited = 0;
2703     }
2704     ei->buf[pos] = '\0';
2705     result = 0;
2706  error:
2707     return result;
2708 }
2709
2710 int fa_enumerate(struct fa *fa, int limit, char ***words) {
2711     struct enum_intl ei;
2712     int result = -1;
2713
2714     *words = NULL;
2715     MEMZERO(&ei, 1);
2716     ei.bsize = 8;                    /* Arbitrary initial size */
2717     ei.limit = limit;
2718     F(ALLOC_N(ei.words, limit));
2719     F(ALLOC_N(ei.buf, ei.bsize));
2720
2721     /* We use the visited bit to track which states we already visited
2722      * during the construction of a word to detect loops */
2723     list_for_each(s, fa->initial)
2724         s->visited = 0;
2725     fa->initial->visited = 1;
2726     if (fa->initial->accept) {
2727         if (ei.nwords >= limit)
2728             return -2;
2729         ei.words[0] = strdup("");
2730         E(ei.words[0] == NULL);
2731         ei.nwords = 1;
2732     }
2733     result = fa_enumerate_intl(fa->initial, &ei, 0);
2734     E(result < 0);
2735
2736     result = ei.nwords;
2737     *words = ei.words;
2738     ei.words = NULL;
2739  done:
2740     free(ei.buf);
2741     return result;
2742
2743  error:
2744     for (int i=0; i < ei.nwords; i++)
2745         free(ei.words[i]);
2746     free(ei.words);
2747     goto done;
2748 }
2749
2750 /* Expand the automaton FA by replacing every transition s(c) -> p from
2751  * state s to p on character c by two transitions s(X) -> r, r(c) -> p via
2752  * a new state r.
2753  * If ADD_MARKER is true, also add for each original state s a new a loop
2754  * s(Y) -> q and q(X) -> s through a new state q.
2755  *
2756  * The end result is that an automaton accepting "a|ab" is turned into one
2757  * accepting "Xa|XaXb" if add_marker is false and "(YX)*Xa|(YX)*Xa(YX)*Xb"
2758  * when add_marker is true.
2759  *
2760  * The returned automaton is a copy of FA, FA is not modified.
2761  */
2762 static struct fa *expand_alphabet(struct fa *fa, int add_marker,
2763                                   char X, char Y) {
2764     int ret;
2765
2766     fa = fa_clone(fa);
2767     if (fa == NULL)
2768         return NULL;
2769
2770     F(mark_reachable(fa));
2771     list_for_each(p, fa->initial) {
2772         if (! p->reachable)
2773             continue;
2774
2775         struct state *r = add_state(fa, 0);
2776         if (r == NULL)
2777             goto error;
2778         r->trans = p->trans;
2779         r->tused = p->tused;
2780         r->tsize = p->tsize;
2781         p->trans = NULL;
2782         p->tused = p->tsize = 0;
2783         ret = add_new_trans(p, r, X, X);
2784         if (ret < 0)
2785             goto error;
2786         if (add_marker) {
2787             struct state *q = add_state(fa, 0);
2788             ret = add_new_trans(p, q, Y, Y);
2789             if (ret < 0)
2790                 goto error;
2791             ret = add_new_trans(q, p, X, X);
2792             if (ret < 0)
2793                 goto error;
2794         }
2795     }
2796     return fa;
2797  error:
2798     fa_free(fa);
2799     return NULL;
2800 }
2801
2802 static bitset *alphabet(struct fa *fa) {
2803     bitset *bs = bitset_init(UCHAR_NUM);
2804
2805     if (bs == NULL)
2806         return NULL;
2807
2808     list_for_each(s, fa->initial) {
2809         for (int i=0; i < s->tused; i++) {
2810             for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2811                 bitset_set(bs, c);
2812         }
2813     }
2814     return bs;
2815 }
2816
2817 static bitset *last_chars(struct fa *fa) {
2818     bitset *bs = bitset_init(UCHAR_NUM);
2819
2820     if (bs == NULL)
2821         return NULL;
2822
2823     list_for_each(s, fa->initial) {
2824         for (int i=0; i < s->tused; i++) {
2825             if (s->trans[i].to->accept) {
2826                 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2827                     bitset_set(bs, c);
2828             }
2829         }
2830     }
2831     return bs;
2832 }
2833
2834 static bitset *first_chars(struct fa *fa) {
2835     bitset *bs = bitset_init(UCHAR_NUM);
2836     struct state *s = fa->initial;
2837
2838     if (bs == NULL)
2839         return NULL;
2840
2841     for (int i=0; i < s->tused; i++) {
2842         for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2843             bitset_set(bs, c);
2844     }
2845     return bs;
2846 }
2847
2848 /* Return 1 if F1 and F2 are known to be unambiguously concatenable
2849  * according to simple heuristics. Return 0 if they need to be checked
2850  * further to decide ambiguity
2851  * Return -1 if an allocation fails
2852  */
2853 static int is_splittable(struct fa *fa1, struct fa *fa2) {
2854     bitset *alpha1 = NULL;
2855     bitset *alpha2 = NULL;
2856     bitset *last1 = NULL;
2857     bitset *first2 = NULL;
2858     bool result = -1;
2859
2860     alpha2 = alphabet(fa2);
2861     last1 = last_chars(fa1);
2862     if (alpha2 == NULL || last1 == NULL)
2863         goto done;
2864     if (bitset_disjoint(last1, alpha2, UCHAR_NUM)) {
2865         result = 1;
2866         goto done;
2867     }
2868
2869     alpha1 = alphabet(fa1);
2870     first2 = first_chars(fa2);
2871     if (alpha1 == NULL || first2 == NULL)
2872         goto done;
2873     if (bitset_disjoint(first2, alpha1, UCHAR_NUM)) {
2874         result = 1;
2875         goto done;
2876     }
2877     result = 0;
2878  done:
2879     bitset_free(alpha1);
2880     bitset_free(alpha2);
2881     bitset_free(last1);
2882     bitset_free(first2);
2883     return result;
2884 }
2885
2886 /* This algorithm is due to Anders Moeller, and can be found in class
2887  * AutomatonOperations in dk.brics.grammar
2888  */
2889 int fa_ambig_example(struct fa *fa1, struct fa *fa2,
2890                      char **upv, size_t *upv_len,
2891                      char **pv, char **v) {
2892     static const char X = '\001';
2893     static const char Y = '\002';
2894     char *result = NULL, *s = NULL;
2895     size_t result_len = 0;
2896     int ret = -1, r;
2897     struct fa *mp = NULL, *ms = NULL, *sp = NULL, *ss = NULL, *amb = NULL;
2898     struct fa *a1f = NULL, *a1t = NULL, *a2f = NULL, *a2t = NULL;
2899     struct fa *b1 = NULL, *b2 = NULL;
2900
2901     *upv = NULL;
2902     *upv_len = 0;
2903     if (pv != NULL)
2904         *pv = NULL;
2905     if (v != NULL)
2906         *v = NULL;
2907
2908     r = is_splittable(fa1, fa2);
2909     if (r < 0)
2910         goto error;
2911     if (r == 1)
2912         return 0;
2913
2914 #define Xs "\001"
2915 #define Ys "\002"
2916 #define MPs Ys Xs "(" Xs "(.|\n))+"
2917 #define MSs Ys Xs "(" Xs "(.|\n))*"
2918 #define SPs "(" Xs "(.|\n))+" Ys Xs
2919 #define SSs "(" Xs "(.|\n))*" Ys Xs
2920     /* These could become static constants */
2921     r = fa_compile( MPs, strlen(MPs), &mp);
2922     if (r != REG_NOERROR)
2923         goto error;
2924     r = fa_compile( MSs, strlen(MSs), &ms);
2925     if (r != REG_NOERROR)
2926         goto error;
2927     r = fa_compile( SPs, strlen(SPs), &sp);
2928     if (r != REG_NOERROR)
2929         goto error;
2930     r = fa_compile( SSs, strlen(SSs), &ss);
2931     if (r != REG_NOERROR)
2932         goto error;
2933 #undef SSs
2934 #undef SPs
2935 #undef MSs
2936 #undef MPs
2937 #undef Xs
2938 #undef Ys
2939
2940     a1f = expand_alphabet(fa1, 0, X, Y);
2941     a1t = expand_alphabet(fa1, 1, X, Y);
2942     a2f = expand_alphabet(fa2, 0, X, Y);
2943     a2t = expand_alphabet(fa2, 1, X, Y);
2944     if (a1f == NULL || a1t == NULL || a2f == NULL || a2t == NULL)
2945         goto error;
2946
2947     /* Compute b1 = ((a1f . mp) & a1t) . ms */
2948     if (concat_in_place(a1f, &mp) < 0)
2949         goto error;
2950     b1 = fa_intersect(a1f, a1t);
2951     if (b1 == NULL)
2952         goto error;
2953     if (concat_in_place(b1, &ms) < 0)
2954         goto error;
2955
2956     /* Compute b2 = ss . ((sp . a2f) & a2t) */
2957     if (concat_in_place(sp, &a2f) < 0)
2958         goto error;
2959     b2 = fa_intersect(sp, a2t);
2960     if (b2 == NULL)
2961         goto error;
2962     if (concat_in_place(ss, &b2) < 0)
2963         goto error;
2964     b2 = ss;
2965     ss = NULL;
2966
2967     /* The automaton we are really interested in */
2968     amb = fa_intersect(b1, b2);
2969     if (amb == NULL)
2970         goto error;
2971
2972     size_t s_len = 0;
2973     r = fa_example(amb, &s, &s_len);
2974     if (r < 0)
2975         goto error;
2976
2977     if (s != NULL) {
2978         char *t;
2979         result_len = (s_len-1)/2 - 1;
2980         F(ALLOC_N(result, result_len + 1));
2981         t = result;
2982         int i = 0;
2983         for (i=0; s[2*i] == X; i++) {
2984             assert((t - result) < result_len);
2985             *t++ = s[2*i + 1];
2986         }
2987         if (pv != NULL)
2988             *pv = t;
2989         i += 1;
2990
2991         for ( ;s[2*i] == X; i++) {
2992             assert((t - result) < result_len);
2993             *t++ = s[2*i + 1];
2994         }
2995         if (v != NULL)
2996             *v = t;
2997         i += 1;
2998
2999         for (; 2*i+1 < s_len; i++) {
3000             assert((t - result) < result_len);
3001             *t++ = s[2*i + 1];
3002         }
3003     }
3004     ret = 0;
3005
3006  done:
3007     /* Clean up intermediate automata */
3008     fa_free(mp);
3009     fa_free(ms);
3010     fa_free(ss);
3011     fa_free(sp);
3012     fa_free(a1f);
3013     fa_free(a1t);
3014     fa_free(a2f);
3015     fa_free(a2t);
3016     fa_free(b1);
3017     fa_free(b2);
3018     fa_free(amb);
3019
3020     FREE(s);
3021     *upv = result;
3022     if (result != NULL)
3023         *upv_len = result_len;
3024     return ret;
3025  error:
3026     FREE(result);
3027     ret = -1;
3028     goto done;
3029 }
3030
3031 /*
3032  * Construct an fa from a regular expression
3033  */
3034 static struct fa *fa_from_re(struct re *re) {
3035     struct fa *result = NULL;
3036
3037     switch(re->type) {
3038     case UNION:
3039         {
3040             result = fa_from_re(re->exp1);
3041             if (result == NULL)
3042                 goto error;
3043             struct fa *fa2 = fa_from_re(re->exp2);
3044             if (fa2 == NULL)
3045                 goto error;
3046             if (union_in_place(result, &fa2) < 0)
3047                 goto error;
3048         }
3049         break;
3050     case CONCAT:
3051         {
3052             result = fa_from_re(re->exp1);
3053             if (result == NULL)
3054                 goto error;
3055             struct fa *fa2 = fa_from_re(re->exp2);
3056             if (fa2 == NULL)
3057                 goto error;
3058             if (concat_in_place(result, &fa2) < 0)
3059                 goto error;
3060         }
3061         break;
3062     case CSET:
3063         result = fa_make_char_set(re->cset, re->negate);
3064         break;
3065     case ITER:
3066         {
3067             struct fa *fa = fa_from_re(re->exp);
3068             if (fa == NULL)
3069                 goto error;
3070             result = fa_iter(fa, re->min, re->max);
3071             fa_free(fa);
3072         }
3073         break;
3074     case EPSILON:
3075         result = fa_make_epsilon();
3076         break;
3077     case CHAR:
3078         result = fa_make_char(re->c);
3079         break;
3080     default:
3081         assert(0);
3082         break;
3083     }
3084     return result;
3085  error:
3086     fa_free(result);
3087     return NULL;
3088 }
3089
3090 static void free_re(struct re *re) {
3091     if (re == NULL)
3092         return;
3093     assert(re->ref == 0);
3094
3095     if (re->type == UNION || re->type == CONCAT) {
3096         re_unref(re->exp1);
3097         re_unref(re->exp2);
3098     } else if (re->type == ITER) {
3099         re_unref(re->exp);
3100     } else if (re->type == CSET) {
3101         bitset_free(re->cset);
3102     }
3103     free(re);
3104 }
3105
3106 int fa_compile(const char *regexp, size_t size, struct fa **fa) {
3107     struct re *re = NULL;
3108     struct re_parse parse;
3109
3110     *fa = NULL;
3111
3112     parse.rx = regexp;
3113     parse.rend = regexp + size;
3114     parse.error = REG_NOERROR;
3115
3116     re = parse_regexp(&parse);
3117     if (re == NULL)
3118         return parse.error;
3119
3120     *fa = fa_from_re(re);
3121     re_unref(re);
3122
3123     if (*fa == NULL || collect(*fa) < 0)
3124         parse.error = REG_ESPACE;
3125     return parse.error;
3126 }
3127
3128 /* We represent a case-insensitive FA by using only transitions on
3129  * lower-case letters.
3130  */
3131 int fa_nocase(struct fa *fa) {
3132     if (fa == NULL || fa->nocase)
3133         return 0;
3134
3135     fa->nocase = 1;
3136     list_for_each(s, fa->initial) {
3137         int tused = s->tused;
3138         /* For every transition on characters in [A-Z] add a corresponding
3139          * transition on [a-z]; remove any portion covering [A-Z] */
3140         for (int i=0; i < tused; i++) {
3141             struct trans *t = s->trans + i;
3142             int lc_min = t->min < 'A' ? 'a' : tolower(t->min);
3143             int lc_max = t->max > 'Z' ? 'z' : tolower(t->max);
3144
3145             if (t->min > 'Z' || t->max < 'A')
3146                 continue;
3147             if (t->min >= 'A' && t->max <= 'Z') {
3148                 t->min = tolower(t->min);
3149                 t->max = tolower(t->max);
3150             } else if (t->max <= 'Z') {
3151                 /* t->min < 'A' */
3152                 t->max = 'A' - 1;
3153                 F(add_new_trans(s, t->to, lc_min, lc_max));
3154             } else if (t->min >= 'A') {
3155                 /* t->max > 'Z' */
3156                 t->min = 'Z' + 1;
3157                 F(add_new_trans(s, t->to, lc_min, lc_max));
3158             } else {
3159                 /* t->min < 'A' && t->max > 'Z' */
3160                 F(add_new_trans(s, t->to, 'Z' + 1, t->max));
3161                 s->trans[i].max = 'A' - 1;
3162                 F(add_new_trans(s, s->trans[i].to, lc_min, lc_max));
3163             }
3164         }
3165     }
3166     F(collect(fa));
3167     return 0;
3168  error:
3169     return -1;
3170 }
3171
3172 int fa_is_nocase(struct fa *fa) {
3173     return fa->nocase;
3174 }
3175
3176 /* If FA is case-insensitive, turn it into a case-sensitive automaton by
3177  * adding transitions on upper-case letters for each existing transition on
3178  * lower-case letters */
3179 static int case_expand(struct fa *fa) {
3180     if (! fa->nocase)
3181         return 0;
3182
3183     fa->nocase = 0;
3184     list_for_each(s, fa->initial) {
3185         int tused = s->tused;
3186         /* For every transition on characters in [a-z] add a corresponding
3187          * transition on [A-Z] */
3188         for (int i=0; i < tused; i++) {
3189             struct trans *t = s->trans + i;
3190             int lc_min = t->min < 'a' ? 'A' : toupper(t->min);
3191             int lc_max = t->max > 'z' ? 'Z' : toupper(t->max);
3192
3193             if (t->min > 'z' || t->max < 'a')
3194                 continue;
3195             F(add_new_trans(s, t->to, lc_min, lc_max));
3196         }
3197     }
3198     F(collect(fa));
3199     return 0;
3200  error:
3201     return -1;
3202 }
3203
3204 /*
3205  * Regular expression parser
3206  */
3207
3208 static struct re *make_re(enum re_type type) {
3209     struct re *re;
3210     if (make_ref(re) == 0)
3211         re->type = type;
3212     return re;
3213 }
3214
3215 static struct re *make_re_rep(struct re *exp, int min, int max) {
3216     struct re *re = make_re(ITER);
3217     if (re) {
3218         re->exp = exp;
3219         re->min = min;
3220         re->max = max;
3221     } else {
3222         re_unref(exp);
3223     }
3224     return re;
3225 }
3226
3227 static struct re *make_re_binop(enum re_type type, struct re *exp1,
3228                                 struct re *exp2) {
3229     struct re *re = make_re(type);
3230     if (re) {
3231         re->exp1 = exp1;
3232         re->exp2 = exp2;
3233     } else {
3234         re_unref(exp1);
3235         re_unref(exp2);
3236     }
3237     return re;
3238 }
3239
3240 static struct re *make_re_char(uchar c) {
3241     struct re *re = make_re(CHAR);
3242     if (re)
3243         re->c = c;
3244     return re;
3245 }
3246
3247 static struct re *make_re_char_set(bool negate, bool no_ranges) {
3248     struct re *re = make_re(CSET);
3249     if (re) {
3250         re->negate = negate;
3251         re->no_ranges = no_ranges;
3252         re->cset = bitset_init(UCHAR_NUM);
3253         if (re->cset == NULL)
3254             re_unref(re);
3255     }
3256     return re;
3257 }
3258
3259 static bool more(struct re_parse *parse) {
3260     return parse->rx < parse->rend;
3261 }
3262
3263 static bool match(struct re_parse *parse, char m) {
3264     if (!more(parse))
3265         return false;
3266     if (*parse->rx == m) {
3267         parse->rx += 1;
3268         return true;
3269     }
3270     return false;
3271 }
3272
3273 static bool peek(struct re_parse *parse, const char *chars) {
3274     return *parse->rx != '\0' && strchr(chars, *parse->rx) != NULL;
3275 }
3276
3277 static bool next(struct re_parse *parse, char *c) {
3278     if (!more(parse))
3279         return false;
3280     *c = *parse->rx;
3281     parse->rx += 1;
3282     return true;
3283 }
3284
3285 static bool parse_char(struct re_parse *parse, int quoted, char *c) {
3286     if (!more(parse))
3287         return false;
3288     if (quoted && *parse->rx == '\\') {
3289         parse->rx += 1;
3290         return next(parse, c);
3291     } else {
3292         return next(parse, c);
3293     }
3294 }
3295
3296 static void add_re_char(struct re *re, uchar from, uchar to) {
3297     assert(re->type == CSET);
3298     for (unsigned int c = from; c <= to; c++)
3299         bitset_set(re->cset, c);
3300 }
3301
3302 static void parse_char_class(struct re_parse *parse, struct re *re) {
3303     if (! more(parse)) {
3304         parse->error = REG_EBRACK;
3305         goto error;
3306     }
3307     char from, to;
3308     parse_char(parse, 0, &from);
3309     to = from;
3310     if (match(parse, '-')) {
3311         if (! more(parse)) {
3312             parse->error = REG_EBRACK;
3313             goto error;
3314         }
3315         if (peek(parse, "]")) {
3316             if (from > to) {
3317                 parse->error = REG_ERANGE;
3318                 goto error;
3319             }
3320             add_re_char(re, from, to);
3321             add_re_char(re, '-', '-');
3322             return;
3323         } else if (!parse_char(parse, 0, &to)) {
3324             parse->error = REG_ERANGE;
3325             goto error;
3326         }
3327     }
3328     if (from > to) {
3329         parse->error = REG_ERANGE;
3330         goto error;
3331     }
3332     add_re_char(re, from, to);
3333  error:
3334     return;
3335 }
3336
3337 static struct re *parse_simple_exp(struct re_parse *parse) {
3338     struct re *re = NULL;
3339
3340     if (match(parse, '[')) {
3341         bool negate = match(parse, '^');
3342         re = make_re_char_set(negate, parse->no_ranges);
3343         if (re == NULL) {
3344             parse->error = REG_ESPACE;
3345             goto error;
3346         }
3347         parse_char_class(parse, re);
3348         if (parse->error != REG_NOERROR)
3349             goto error;
3350         while (more(parse) && ! peek(parse, "]")) {
3351             parse_char_class(parse, re);
3352             if (parse->error != REG_NOERROR)
3353                 goto error;
3354         }
3355         if (! match(parse, ']')) {
3356             parse->error = REG_EBRACK;
3357             goto error;
3358         }
3359     } else if (match(parse, '(')) {
3360         if (match(parse, ')')) {
3361             re = make_re(EPSILON);
3362             if (re == NULL) {
3363                 parse->error = REG_ESPACE;
3364                 goto error;
3365             }
3366         } else {
3367             re = parse_regexp(parse);
3368             if (re == NULL)
3369                 goto error;
3370             if (! match(parse, ')')) {
3371                 parse->error = REG_EPAREN;
3372                 goto error;
3373             }
3374         }
3375     } else if (match(parse, '.')) {
3376         re = make_re_char_set(1, parse->no_ranges);
3377         if (re == NULL) {
3378             parse->error = REG_ESPACE;
3379             goto error;
3380         }
3381         add_re_char(re, '\n', '\n');
3382     } else if (more(parse)) {
3383         char c;
3384         if (!parse_char(parse, 1, &c)) {
3385             parse->error = REG_EESCAPE;
3386             goto error;
3387         }
3388         re = make_re_char(c);
3389         if (re == NULL) {
3390             parse->error = REG_ESPACE;
3391             goto error;
3392         }
3393     } else {
3394         re = make_re(EPSILON);
3395         if (re == NULL) {
3396             parse->error = REG_ESPACE;
3397             goto error;
3398         }
3399     }
3400     return re;
3401  error:
3402     re_unref(re);
3403     return NULL;
3404 }
3405
3406 static int parse_int(struct re_parse *parse) {
3407     const char *lim;
3408     char *end;
3409     size_t used;
3410     long l;
3411
3412     /* We need to be careful that strtoul will never access
3413      * memory beyond parse->rend
3414      */
3415     for (lim = parse->rx; lim < parse->rend && *lim >= '0' && *lim <= '9';
3416          lim++);
3417     if (lim < parse->rend) {
3418         l = strtoul(parse->rx, &end, 10);
3419         used = end - parse->rx;
3420     } else {
3421         char *s = strndup(parse->rx, parse->rend - parse->rx);
3422         if (s == NULL) {
3423             parse->error = REG_ESPACE;
3424             return -1;
3425         }
3426         l = strtoul(s, &end, 10);
3427         used = end - s;
3428         free(s);
3429     }
3430
3431     if (used == 0)
3432         return -1;
3433     parse->rx += used;
3434     if ((l<0) || (l > INT_MAX)) {
3435         parse->error = REG_BADBR;
3436         return -1;
3437     }
3438     return (int) l;
3439 }
3440
3441 static struct re *parse_repeated_exp(struct re_parse *parse) {
3442     struct re *re = parse_simple_exp(parse);
3443     if (re == NULL)
3444         goto error;
3445     if (match(parse, '?')) {
3446         re = make_re_rep(re, 0, 1);
3447     } else if (match(parse, '*')) {
3448         re = make_re_rep(re, 0, -1);
3449     } else if (match(parse, '+')) {
3450         re = make_re_rep(re, 1, -1);
3451     } else if (match(parse, '{')) {
3452         int min, max;
3453         min = parse_int(parse);
3454         if (min == -1)
3455             goto error;
3456         if (match(parse, ',')) {
3457             max = parse_int(parse);
3458             if (max == -1)
3459                 max = -1;      /* If it's not an int, it means 'unbounded' */
3460             if (! match(parse, '}')) {
3461                 parse->error = REG_EBRACE;
3462                 goto error;
3463             }
3464         } else if (match(parse, '}')) {
3465             max = min;
3466         } else {
3467             parse->error = REG_EBRACE;
3468             goto error;
3469         }
3470         if (min > max && max != -1) {
3471             parse->error = REG_BADBR;
3472             goto error;
3473         }
3474         re = make_re_rep(re, min, max);
3475     }
3476     if (re == NULL)
3477         parse->error = REG_ESPACE;
3478     return re;
3479  error:
3480     re_unref(re);
3481     return NULL;
3482 }
3483
3484 static struct re *parse_concat_exp(struct re_parse *parse) {
3485     struct re *re = parse_repeated_exp(parse);
3486     if (re == NULL)
3487         goto error;
3488
3489     if (more(parse) && ! peek(parse, ")|")) {
3490         struct re *re2 = parse_concat_exp(parse);
3491         if (re2 == NULL)
3492             goto error;
3493         re = make_re_binop(CONCAT, re, re2);
3494         if (re == NULL) {
3495             parse->error = REG_ESPACE;
3496             goto error;
3497         }
3498     }
3499     return re;
3500
3501  error:
3502     re_unref(re);
3503     return NULL;
3504 }
3505
3506 static struct re *parse_regexp(struct re_parse *parse) {
3507     struct re *re = NULL;
3508
3509     /* Something like (|r) */
3510     if (peek(parse, "|"))
3511         re = make_re(EPSILON);
3512     else
3513         re = parse_concat_exp(parse);
3514     if (re == NULL)
3515         goto error;
3516
3517     if (match(parse, '|')) {
3518         struct re *re2 = NULL;
3519         /* Something like (r|) */
3520         if (peek(parse, ")"))
3521             re2 = make_re(EPSILON);
3522         else
3523             re2 = parse_regexp(parse);
3524         if (re2 == NULL)
3525             goto error;
3526
3527         re = make_re_binop(UNION, re, re2);
3528         if (re == NULL) {
3529             parse->error = REG_ESPACE;
3530             goto error;
3531         }
3532     }
3533     return re;
3534
3535  error:
3536     re_unref(re);
3537     return NULL;
3538 }
3539
3540 /*
3541  * Convert a STRUCT RE to a string. Code is complicated by the fact that
3542  * we try to be clever and avoid unneeded parens and concatenation with
3543  * epsilon etc.
3544  */
3545 static int re_as_string(const struct re *re, struct re_str *str);
3546
3547 static int re_binop_count(enum re_type type, const struct re *re) {
3548     assert(type == CONCAT || type == UNION);
3549     if (re->type == type) {
3550         return re_binop_count(type, re->exp1) + re_binop_count(type, re->exp2);
3551     } else {
3552         return 1;
3553     }
3554 }
3555
3556 static int re_binop_store(enum re_type type, const struct re *re,
3557                           const struct re **list) {
3558     int pos = 0;
3559     if (type == re->type) {
3560         pos = re_binop_store(type, re->exp1, list);
3561         pos += re_binop_store(type, re->exp2, list + pos);
3562     } else {
3563         list[0] = re;
3564         pos = 1;
3565     }
3566     return pos;
3567 }
3568
3569 static int re_union_as_string(const struct re *re, struct re_str *str) {
3570     assert(re->type == UNION);
3571
3572     int result = -1;
3573     const struct re **res = NULL;
3574     struct re_str *strings = NULL;
3575     int nre = 0, r;
3576
3577     nre = re_binop_count(re->type, re);
3578     r = ALLOC_N(res, nre);
3579     if (r < 0)
3580         goto done;
3581
3582     re_binop_store(re->type, re, res);
3583
3584     r = ALLOC_N(strings, nre);
3585     if (r < 0)
3586         goto error;
3587
3588     str->len = 0;
3589     for (int i=0; i < nre; i++) {
3590         if (re_as_string(res[i], strings + i) < 0)
3591             goto error;
3592         str->len += strings[i].len;
3593     }
3594     str->len += nre-1;
3595
3596     r = re_str_alloc(str);
3597     if (r < 0)
3598         goto error;
3599
3600     char *p = str->rx;
3601     for (int i=0; i < nre; i++) {
3602         if (i>0)
3603             *p++ = '|';
3604         memcpy(p, strings[i].rx, strings[i].len);
3605         p += strings[i].len;
3606     }
3607
3608     result = 0;
3609  done:
3610     free(res);
3611     if (strings != NULL) {
3612         for (int i=0; i < nre; i++)
3613             release_re_str(strings + i);
3614     }
3615     free(strings);
3616     return result;
3617  error:
3618     release_re_str(str);
3619     result = -1;
3620     goto done;
3621 }
3622
3623 ATTRIBUTE_PURE
3624 static int re_needs_parens_in_concat(const struct re *re) {
3625     return (re->type != CHAR && re->type != CSET && re->type != ITER);
3626 }
3627
3628 static int re_concat_as_string(const struct re *re, struct re_str *str) {
3629     assert(re->type == CONCAT);
3630
3631     const struct re **res = NULL;
3632     struct re_str *strings = NULL;
3633     int nre = 0, r;
3634     int result = -1;
3635
3636     nre = re_binop_count(re->type, re);
3637     r = ALLOC_N(res, nre);
3638     if (r < 0)
3639         goto error;
3640     re_binop_store(re->type, re, res);
3641
3642     r = ALLOC_N(strings, nre);
3643     if (r < 0)
3644         goto error;
3645
3646     str->len = 0;
3647     for (int i=0; i < nre; i++) {
3648         if (res[i]->type == EPSILON)
3649             continue;
3650         if (re_as_string(res[i], strings + i) < 0)
3651             goto error;
3652         str->len += strings[i].len;
3653         if (re_needs_parens_in_concat(res[i]))
3654             str->len += 2;
3655     }
3656
3657     r = re_str_alloc(str);
3658     if (r < 0)
3659         goto error;
3660
3661     char *p = str->rx;
3662     for (int i=0; i < nre; i++) {
3663         if (res[i]->type == EPSILON)
3664             continue;
3665         if (re_needs_parens_in_concat(res[i]))
3666             *p++ = '(';
3667         p = memcpy(p, strings[i].rx, strings[i].len);
3668         p += strings[i].len;
3669         if (re_needs_parens_in_concat(res[i]))
3670             *p++ = ')';
3671     }
3672
3673     result = 0;
3674  done:
3675     free(res);
3676     if (strings != NULL) {
3677         for (int i=0; i < nre; i++)
3678             release_re_str(strings + i);
3679     }
3680     free(strings);
3681     return result;
3682  error:
3683     release_re_str(str);
3684     result = -1;
3685     goto done;
3686 }
3687
3688 static bool cset_contains(const struct re *cset, int c) {
3689     return bitset_get(cset->cset, c) != cset->negate;
3690 }
3691
3692 static int re_cset_as_string(const struct re *re, struct re_str *str) {
3693     const uchar rbrack = ']';
3694     const uchar dash = '-';
3695     const uchar nul = '\0';
3696
3697     static const char *const empty_set = "[]";
3698     static const char *const total_set = "(.|\n)";
3699     static const char *const not_newline = ".";
3700
3701     char *s;
3702     int from, to, negate;
3703     size_t len;
3704     int incl_rbrack, incl_dash;
3705     int r;
3706
3707     str->len = strlen(empty_set);
3708
3709     /* We can not include NUL explicitly in a CSET since we use ordinary
3710        NUL delimited strings to represent them. That means that we need to
3711        use negated representation if NUL is to be included (and vice versa)
3712     */
3713     negate = cset_contains(re, nul);
3714     if (negate) {
3715         for (from = UCHAR_MIN;
3716              from <= UCHAR_MAX && cset_contains(re, from);
3717              from += 1);
3718         if (from > UCHAR_MAX) {
3719             /* Special case: the set matches every character */
3720             str->rx = strdup(total_set);
3721             goto done;
3722         }
3723         if (from == '\n') {
3724             for (from += 1;
3725                  from <= UCHAR_MAX && cset_contains(re, from);
3726                  from += 1);
3727             if (from > UCHAR_MAX) {
3728                 /* Special case: the set matches everything but '\n' */
3729                 str->rx = strdup(not_newline);
3730                 goto done;
3731             }
3732         }
3733     }
3734
3735     /* See if ']' and '-' will be explicitly included in the character set
3736        (INCL_RBRACK, INCL_DASH) As we loop over the character set, we reset
3737        these flags if they are in the set, but not mentioned explicitly
3738     */
3739     incl_rbrack = cset_contains(re, rbrack) != negate;
3740     incl_dash = cset_contains(re, dash) != negate;
3741
3742     if (re->no_ranges) {
3743         for (from = UCHAR_MIN; from <= UCHAR_MAX; from++)
3744             if (cset_contains(re, from) != negate)
3745                 str->len += 1;
3746     } else {
3747         for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3748             while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3749                 from += 1;
3750             if (from > UCHAR_MAX)
3751                 break;
3752             for (to = from;
3753                  to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3754                  to++);
3755
3756             if (to == from && (from == rbrack || from == dash))
3757                 continue;
3758             if (from == rbrack || from == dash)
3759                 from += 1;
3760             if (to == rbrack || to == dash)
3761                 to -= 1;
3762
3763             len = (to == from) ? 1 : ((to == from + 1) ? 2 : 3);
3764
3765             if (from < rbrack && rbrack < to)
3766                 incl_rbrack = 0;
3767             if (from < dash && dash < to)
3768                 incl_dash = 0;
3769             str->len += len;
3770         }
3771         str->len += incl_rbrack + incl_dash;
3772     }
3773     if (negate)
3774         str->len += 1;        /* For the ^ */
3775
3776     r = re_str_alloc(str);
3777     if (r < 0)
3778         goto error;
3779
3780     s = str->rx;
3781     *s++ = '[';
3782     if (negate)
3783         *s++ = '^';
3784     if (incl_rbrack)
3785         *s++ = rbrack;
3786
3787     if (re->no_ranges) {
3788         for (from = UCHAR_MIN; from <= UCHAR_MAX; from++) {
3789             if (from == rbrack || from == dash)
3790                 continue;
3791             if (cset_contains(re, from) != negate)
3792                 *s++ = from;
3793         }
3794     } else {
3795         for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3796             while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3797                 from += 1;
3798             if (from > UCHAR_MAX)
3799                 break;
3800             for (to = from;
3801                  to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3802                  to++);
3803
3804             if (to == from && (from == rbrack || from == dash))
3805                 continue;
3806             if (from == rbrack || from == dash)
3807                 from += 1;
3808             if (to == rbrack || to == dash)
3809                 to -= 1;
3810
3811             if (to == from) {
3812                 *s++ = from;
3813             } else if (to == from + 1) {
3814                 *s++ = from;
3815                 *s++ = to;
3816             } else {
3817                 *s++ = from;
3818                 *s++ = '-';
3819                 *s++ = to;
3820             }
3821         }
3822     }
3823     if (incl_dash)
3824         *s++ = dash;
3825
3826     *s = ']';
3827  done:
3828     if (str->rx == NULL)
3829         goto error;
3830     str->len = strlen(str->rx);
3831     return 0;
3832  error:
3833     release_re_str(str);
3834     return -1;
3835 }
3836
3837 static int re_iter_as_string(const struct re *re, struct re_str *str) {
3838     const char *quant = NULL;
3839     char *iter = NULL;
3840     int r, result = -1;
3841
3842     if (re_as_string(re->exp, str) < 0)
3843         return -1;
3844
3845     if (re->min == 0 && re->max == -1) {
3846         quant = "*";
3847     } else if (re->min == 1 && re->max == -1) {
3848         quant = "+";
3849     } else if (re->min == 0 && re->max == 1) {
3850         quant = "?";
3851     } else if (re->max == -1) {
3852         r = asprintf(&iter, "{%d,}", re->min);
3853         if (r < 0)
3854             return -1;
3855         quant = iter;
3856     } else {
3857         r = asprintf(&iter, "{%d,%d}", re->min, re->max);
3858         if (r < 0)
3859             return -1;
3860         quant = iter;
3861     }
3862
3863     if (re->exp->type == CHAR || re->exp->type == CSET) {
3864         if (REALLOC_N(str->rx, str->len + strlen(quant) + 1) < 0)
3865             goto error;
3866         strcpy(str->rx + str->len, quant);
3867         str->len += strlen(quant);
3868     } else {
3869         /* Format '(' + str->rx ')' + quant */
3870         if (REALLOC_N(str->rx, str->len + strlen(quant) + 1 + 2) < 0)
3871             goto error;
3872         memmove(str->rx + 1, str->rx, str->len);
3873         str->rx[0] = '(';
3874         str->rx[str->len + 1] = ')';
3875         str->len += 2;
3876         strcpy(str->rx + str->len, quant);
3877         str->len += strlen(quant);
3878     }
3879
3880     result = 0;
3881  done:
3882     FREE(iter);
3883     return result;
3884  error:
3885     release_re_str(str);
3886     goto done;
3887 }
3888
3889 static int re_as_string(const struct re *re, struct re_str *str) {
3890     /* Characters that must be escaped */
3891     static const char * const special_chars = ".()[]{}*|+?\\^$";
3892     int result = 0;
3893
3894     switch(re->type) {
3895     case UNION:
3896         result = re_union_as_string(re, str);
3897         break;
3898     case CONCAT:
3899         result = re_concat_as_string(re, str);
3900         break;
3901     case CSET:
3902         result = re_cset_as_string(re, str);
3903         break;
3904     case CHAR:
3905         if (re->c == '\0' || strchr(special_chars, re->c) == NULL) {
3906             if (ALLOC_N(str->rx, 2) < 0)
3907                 goto error;
3908             str->rx[0] = re->c;
3909             str->len = 1;
3910         } else {
3911             if (ALLOC_N(str->rx, 3) < 0)
3912                 goto error;
3913             str->rx[0] = '\\';
3914             str->rx[1] = re->c;
3915             str->len = strlen(str->rx);
3916         }
3917         break;
3918     case ITER:
3919         result = re_iter_as_string(re, str);
3920         break;
3921     case EPSILON:
3922         if (ALLOC_N(str->rx, 3) < 0)
3923             goto error;
3924         strcpy(str->rx, "()");
3925         str->len = strlen(str->rx);
3926         break;
3927     default:
3928         assert(0);
3929         abort();
3930         break;
3931     }
3932     return result;
3933  error:
3934     release_re_str(str);
3935     return -1;
3936 }
3937
3938 static int convert_trans_to_re(struct state *s) {
3939     struct re *re = NULL;
3940     size_t nto = 1;
3941     struct trans *trans = NULL;
3942     int r;
3943
3944     if (s->tused == 0)
3945         return 0;
3946
3947     qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
3948     for (int i = 0; i < s->tused - 1; i++) {
3949         if (s->trans[i].to != s->trans[i+1].to)
3950             nto += 1;
3951     }
3952     r = ALLOC_N(trans, nto);
3953     if (r < 0)
3954         goto error;
3955
3956     struct state *to = s->trans[0].to;
3957     int tind = 0;
3958     for_each_trans(t, s) {
3959         if (t->to != to) {
3960             trans[tind].to = to;
3961             trans[tind].re = re;
3962             tind += 1;
3963             re = NULL;
3964             to = t->to;
3965         }
3966         if (re == NULL) {
3967             re = make_re_char_set(0, 0);
3968             if (re == NULL)
3969                 goto error;
3970         }
3971         add_re_char(re, t->min, t->max);
3972     }
3973     assert(nto == tind + 1);
3974     trans[tind].to = to;
3975     trans[tind].re = re;
3976
3977     /* Simplify CSETs with a single char to a CHAR */
3978     for (int t=0; t < nto; t++) {
3979         int cnt = 0;
3980         uchar chr = UCHAR_MIN;
3981         for (int c = 0; c < UCHAR_NUM; c++) {
3982             if (bitset_get(trans[t].re->cset, c)) {
3983                 cnt += 1;
3984                 chr = c;
3985             }
3986         }
3987         if (cnt == 1) {
3988             re_unref(trans[t].re);
3989             trans[t].re = make_re_char(chr);
3990             if (trans[t].re == NULL)
3991                 goto error;
3992         }
3993     }
3994     free_trans(s);
3995     s->trans = trans;
3996     s->tused = s->tsize = nto;
3997     return 0;
3998
3999  error:
4000     if (trans)
4001         for (int i=0; i < nto; i++)
4002             unref(trans[i].re, re);
4003     free(trans);
4004     return -1;
4005 }
4006
4007 ATTRIBUTE_RETURN_CHECK
4008 static int add_new_re_trans(struct state *s1, struct state *s2,
4009                             struct re *re) {
4010     int r;
4011     r = add_new_trans(s1, s2, 0, 0);
4012     if (r < 0)
4013         return -1;
4014     last_trans(s1)->re = re;
4015     return 0;
4016 }
4017
4018 /* Add the regular expression R1 . LOOP* . R2 to the transition
4019    from S1 to S2. */
4020 static int re_collapse_trans(struct state *s1, struct state *s2,
4021                              struct re *r1, struct re *loop, struct re *r2) {
4022     struct re *re = NULL;
4023
4024     if (loop->type != EPSILON) {
4025         loop = make_re_rep(ref(loop), 0, -1);
4026         if (loop == NULL)
4027             goto error;
4028     }
4029
4030     if (r1->type == EPSILON) {
4031         if (loop->type == EPSILON) {
4032             re = ref(r2);
4033         } else {
4034             re = make_re_binop(CONCAT, loop, ref(r2));
4035         }
4036     } else {
4037         if (loop->type == EPSILON) {
4038             if (r2->type == EPSILON) {
4039                 re = ref(r1);
4040             } else {
4041                 re = make_re_binop(CONCAT, ref(r1), ref(r2));
4042             }
4043         } else {
4044             re = make_re_binop(CONCAT, ref(r1), loop);
4045             if (re != NULL && r2->type != EPSILON) {
4046                 re = make_re_binop(CONCAT, re, ref(r2));
4047             }
4048         }
4049     }
4050     if (re == NULL)
4051         goto error;
4052
4053     struct trans *t = NULL;
4054     for (t = s1->trans; t <= last_trans(s1) && t->to != s2; t += 1);
4055     if (t > last_trans(s1)) {
4056         if (add_new_re_trans(s1, s2, re) < 0)
4057             goto error;
4058     } else {
4059         if (t->re == NULL) {
4060             t->re = re;
4061         } else {
4062             t->re = make_re_binop(UNION, re, t->re);
4063             if (t->re == NULL)
4064                 goto error;
4065         }
4066     }
4067     return 0;
4068  error:
4069     // FIXME: make sure we don't leak loop
4070     return -1;
4071 }
4072
4073 static int convert_strings(struct fa *fa) {
4074     struct state_set *worklist = state_set_init(-1, S_NONE);
4075     int result = -1;
4076
4077     E(worklist == NULL);
4078
4079     /* Abuse hash to count indegree, reachable to mark visited states */
4080     list_for_each(s, fa->initial) {
4081         s->hash = 0;
4082         s->reachable = 0;
4083     }
4084
4085     list_for_each(s, fa->initial) {
4086         for_each_trans(t, s) {
4087             t->to->hash += 1;
4088         }
4089     }
4090
4091     for (struct state *s = fa->initial;
4092          s != NULL;
4093          s = state_set_pop(worklist)) {
4094         for (int i=0; i < s->tused; i++) {
4095             struct trans *t = s->trans + i;
4096             struct state *to = t->to;
4097             while (to->hash == 1 && to->tused == 1 && ! to->accept) {
4098                 if (t->re == NULL) {
4099                     t->re = to->trans->re;
4100                     to->trans->re = NULL;
4101                 } else {
4102                     t->re = make_re_binop(CONCAT, t->re, to->trans->re);
4103                     if (t->re == NULL)
4104                         goto error;
4105                 }
4106                 t->to = to->trans->to;
4107                 to->tused = 0;
4108                 to->hash -= 1;
4109                 to = t->to;
4110                 for (int j=0; j < s->tused; j++) {
4111                     if (j != i && s->trans[j].to == to) {
4112                         /* Combine transitions i and j; remove trans j */
4113                         t->re = make_re_binop(UNION, t->re, s->trans[j].re);
4114                         if (t->re == NULL)
4115                             goto error;
4116                         memmove(s->trans + j, s->trans + j + 1,
4117                                 sizeof(s->trans[j]) * (s->tused - j - 1));
4118                         to->hash -= 1;
4119                         s->tused -= 1;
4120                         if (j < i) {
4121                             i = i - 1;
4122                             t = s->trans + i;
4123                         }
4124                     }
4125                 }
4126             }
4127
4128             if (! to->reachable) {
4129                 to->reachable = 1;
4130                 F(state_set_push(worklist, to));
4131             }
4132         }
4133     }
4134
4135     for (struct state *s = fa->initial; s->next != NULL; ) {
4136         if (s->next->hash == 0 && s->next->tused == 0) {
4137             struct state *del = s->next;
4138             s->next = del->next;
4139             free(del->trans);
4140             free(del);
4141         } else {
4142             s = s->next;
4143         }
4144     }
4145     result = 0;
4146
4147  error:
4148     state_set_free(worklist);
4149     return result;
4150 }
4151
4152 /* Convert an FA to a regular expression.
4153  * The strategy is the following:
4154  * (1) For all states S1 and S2, convert the transitions between them
4155  *     into one transition whose regexp is a CSET
4156  * (2) Add a new initial state INI and a new final state FIN to the automaton,
4157  *     a transition from INI to the old initial state of FA, and a transition
4158  *     from all accepting states of FA to FIN; the regexp on those transitions
4159  *     matches only the empty word
4160  * (3) Eliminate states S (except for INI and FIN) one by one:
4161  *     Let LOOP the regexp for the transition S -> S if it exists, epsilon
4162  *     otherwise.
4163  *     For all S1, S2 different from S with S1 -> S -> S2
4164  *       Let R1 the regexp of S1 -> S
4165  *           R2 the regexp of S -> S2
4166  *           R3 the regexp S1 -> S2 (or epsilon if no such transition)
4167  *        set the regexp on the transition S1 -> S2 to
4168  *          R1 . (LOOP)* . R2 | R3
4169  * (4) The regexp for the whole FA can now be found as the regexp of
4170  *     the transition INI -> FIN
4171  * (5) Convert that STRUCT RE to a string with RE_AS_STRING
4172  */
4173 int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) {
4174     int r;
4175     struct state *fin = NULL, *ini = NULL;
4176     struct re *eps = NULL;
4177
4178     *regexp = NULL;
4179     *regexp_len = 0;
4180     fa = fa_clone(fa);
4181     if (fa == NULL)
4182         goto error;
4183
4184     eps = make_re(EPSILON);
4185     if (eps == NULL)
4186         goto error;
4187
4188     fin = add_state(fa,1);
4189     if (fin == NULL)
4190         goto error;
4191
4192     fa->trans_re = 1;
4193
4194     list_for_each(s, fa->initial) {
4195         r = convert_trans_to_re(s);
4196         if (r < 0)
4197             goto error;
4198         if (s->accept && s != fin) {
4199             r = add_new_re_trans(s, fin, ref(eps));
4200             if (r < 0)
4201                 goto error;
4202             s->accept = 0;
4203         }
4204     }
4205
4206     ini = add_state(fa, 0);
4207     if (ini == NULL)
4208         goto error;
4209
4210     r = add_new_re_trans(ini, fa->initial, ref(eps));
4211     if (r < 0)
4212         goto error;
4213     set_initial(fa, ini);
4214
4215     convert_strings(fa);
4216
4217     list_for_each(s, fa->initial->next) {
4218         if (s == fin)
4219             continue;
4220         /* Eliminate S */
4221         struct re *loop = eps;
4222         for_each_trans(t, s) {
4223             if (t->to == s)
4224                 loop = t->re;
4225         }
4226         list_for_each(s1, fa->initial) {
4227             if (s == s1)
4228                 continue;
4229             for (int t1 = 0; t1 < s1->tused; t1++) {
4230                 if (s1->trans[t1].to == s) {
4231                     for (int t = 0; t < s->tused; t++) {
4232                         if (s->trans[t].to == s)
4233                             continue;
4234                         r = re_collapse_trans(s1, s->trans[t].to,
4235                                               s1->trans[t1].re,
4236                                               loop,
4237                                               s->trans[t].re);
4238                         if (r < 0)
4239                             goto error;
4240                     }
4241                 }
4242             }
4243         }
4244     }
4245
4246     re_unref(eps);
4247
4248     for_each_trans(t, fa->initial) {
4249         if (t->to == fin) {
4250             struct re_str str;
4251             MEMZERO(&str, 1);
4252             if (re_as_string(t->re, &str) < 0)
4253                 goto error;
4254             *regexp = str.rx;
4255             *regexp_len = str.len;
4256         }
4257     }
4258
4259     list_for_each(s, fa->initial) {
4260         for_each_trans(t, s) {
4261             unref(t->re, re);
4262         }
4263     }
4264     fa_free(fa);
4265
4266     return 0;
4267  error:
4268     fa_free(fa);
4269     re_unref(eps);
4270     return -1;
4271 }
4272
4273 static int re_restrict_alphabet(struct re *re, uchar from, uchar to) {
4274     int r1, r2;
4275     int result = 0;
4276
4277     switch(re->type) {
4278     case UNION:
4279     case CONCAT:
4280         r1 = re_restrict_alphabet(re->exp1, from, to);
4281         r2 = re_restrict_alphabet(re->exp2, from, to);
4282         result = (r1 != 0) ? r1 : r2;
4283         break;
4284     case CSET:
4285         if (re->negate) {
4286             re->negate = 0;
4287             bitset_negate(re->cset, UCHAR_NUM);
4288         }
4289         for (int i=from; i <= to; i++)
4290             bitset_clr(re->cset, i);
4291         break;
4292     case CHAR:
4293         if (from <= re->c && re->c <= to)
4294             result = -1;
4295         break;
4296     case ITER:
4297         result = re_restrict_alphabet(re->exp, from, to);
4298         break;
4299     case EPSILON:
4300         break;
4301     default:
4302         assert(0);
4303         abort();
4304         break;
4305     }
4306     return result;
4307 }
4308
4309 int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
4310                          char **newregexp, size_t *newregexp_len,
4311                          char from, char to) {
4312     int result;
4313     struct re *re = NULL;
4314     struct re_parse parse;
4315     struct re_str str;
4316
4317     *newregexp = NULL;
4318     MEMZERO(&parse, 1);
4319     parse.rx = regexp;
4320     parse.rend = regexp + regexp_len;
4321     parse.error = REG_NOERROR;
4322     re = parse_regexp(&parse);
4323     if (parse.error != REG_NOERROR)
4324         return parse.error;
4325
4326     result = re_restrict_alphabet(re, from, to);
4327     if (result != 0) {
4328         result = -2;
4329         goto done;
4330     }
4331
4332     MEMZERO(&str, 1);
4333     result = re_as_string(re, &str);
4334     *newregexp = str.rx;
4335     *newregexp_len = str.len;
4336  done:
4337     re_unref(re);
4338     return result;
4339 }
4340
4341 int fa_expand_char_ranges(const char *regexp, size_t regexp_len,
4342                           char **newregexp, size_t *newregexp_len) {
4343     int result;
4344     struct re *re = NULL;
4345     struct re_parse parse;
4346     struct re_str str;
4347
4348     *newregexp = NULL;
4349     MEMZERO(&parse, 1);
4350     parse.rx = regexp;
4351     parse.rend = regexp + regexp_len;
4352     parse.error = REG_NOERROR;
4353     parse.no_ranges = 1;
4354     re = parse_regexp(&parse);
4355     if (parse.error != REG_NOERROR)
4356         return parse.error;
4357
4358     MEMZERO(&str, 1);
4359     result = re_as_string(re, &str);
4360     *newregexp = str.rx;
4361     *newregexp_len = str.len;
4362     re_unref(re);
4363     return result;
4364 }
4365
4366 /* Expand regexp so that it is case-insensitive in a case-sensitive match.
4367  *
4368  * Return 1 when a change was made, -1 when an allocation failed, and 0
4369  * when no change was made.
4370  */
4371 static int re_case_expand(struct re *re) {
4372     int result = 0, r1, r2;
4373
4374     switch(re->type) {
4375     case UNION:
4376     case CONCAT:
4377         r1 = re_case_expand(re->exp1);
4378         r2 = re_case_expand(re->exp2);
4379         result = (r1 != 0) ? r1 : r2;
4380         break;
4381     case CSET:
4382         for (int c = 'A'; c <= 'Z'; c++)
4383             if (bitset_get(re->cset, c)) {
4384                 result = 1;
4385                 bitset_set(re->cset, tolower(c));
4386             }
4387         for (int c = 'a'; c <= 'z'; c++)
4388             if (bitset_get(re->cset, c)) {
4389                 result = 1;
4390                 bitset_set(re->cset, toupper(c));
4391             }
4392         break;
4393     case CHAR:
4394         if (isalpha(re->c)) {
4395             int c = re->c;
4396             re->type = CSET;
4397             re->negate = false;
4398             re->no_ranges = 0;
4399             re->cset = bitset_init(UCHAR_NUM);
4400             if (re->cset == NULL)
4401                 return -1;
4402             bitset_set(re->cset, tolower(c));
4403             bitset_set(re->cset, toupper(c));
4404             result = 1;
4405         }
4406         break;
4407     case ITER:
4408         result = re_case_expand(re->exp);
4409         break;
4410     case EPSILON:
4411         break;
4412     default:
4413         assert(0);
4414         abort();
4415         break;
4416     }
4417     return result;
4418 }
4419
4420 int fa_expand_nocase(const char *regexp, size_t regexp_len,
4421                      char **newregexp, size_t *newregexp_len) {
4422     int result, r;
4423     struct re *re = NULL;
4424     struct re_parse parse;
4425     struct re_str str;
4426
4427     *newregexp = NULL;
4428     MEMZERO(&parse, 1);
4429     parse.rx = regexp;
4430     parse.rend = regexp + regexp_len;
4431     parse.error = REG_NOERROR;
4432     re = parse_regexp(&parse);
4433     if (parse.error != REG_NOERROR)
4434         return parse.error;
4435
4436     r = re_case_expand(re);
4437     if (r < 0) {
4438         re_unref(re);
4439         return REG_ESPACE;
4440     }
4441
4442     if (r == 1) {
4443         MEMZERO(&str, 1);
4444         result = re_as_string(re, &str);
4445         *newregexp = str.rx;
4446         *newregexp_len = str.len;
4447     } else {
4448         *newregexp = strndup(regexp, regexp_len);
4449         *newregexp_len = regexp_len;
4450         result = (*newregexp == NULL) ? REG_ESPACE : REG_NOERROR;
4451     }
4452     re_unref(re);
4453     return result;
4454 }
4455
4456 static void print_char(FILE *out, uchar c) {
4457     /* We escape '/' as '\\/' since dot chokes on bare slashes in labels;
4458        Also, a space ' ' is shown as '\s' */
4459     static const char *const escape_from = " \n\t\v\b\r\f\a/\0";
4460     static const char *const escape_to = "sntvbrfa/0";
4461     char *p = strchr(escape_from, c);
4462     if (p != NULL) {
4463         int i = p - escape_from;
4464         fprintf(out, "\\\\%c", escape_to[i]);
4465     } else if (! isprint(c)) {
4466         fprintf(out, "\\\\0%03o", (unsigned char) c);
4467     } else if (c == '"') {
4468         fprintf(out, "\\\"");
4469     } else {
4470         fputc(c, out);
4471     }
4472 }
4473
4474 void fa_dot(FILE *out, struct fa *fa) {
4475     fprintf(out, "digraph {\n  rankdir=LR;");
4476     list_for_each(s, fa->initial) {
4477         if (s->accept) {
4478             fprintf(out, "\"%p\" [shape=doublecircle];\n", s);
4479         } else {
4480             fprintf(out, "\"%p\" [shape=circle];\n", s);
4481         }
4482     }
4483     fprintf(out, "%s -> \"%p\";\n", fa->deterministic ? "dfa" : "nfa",
4484             fa->initial);
4485
4486     struct re_str str;
4487     MEMZERO(&str, 1);
4488     list_for_each(s, fa->initial) {
4489         for_each_trans(t, s) {
4490             fprintf(out, "\"%p\" -> \"%p\" [ label = \"", s, t->to);
4491             if (fa->trans_re) {
4492                 re_as_string(t->re, &str);
4493                 for (int i=0; i < str.len; i++) {
4494                     print_char(out, str.rx[i]);
4495                 }
4496                 release_re_str(&str);
4497             } else {
4498                 print_char(out, t->min);
4499                 if (t->min != t->max) {
4500                     fputc('-', out);
4501                     print_char(out, t->max);
4502                 }
4503             }
4504             fprintf(out, "\" ];\n");
4505         }
4506     }
4507     fprintf(out, "}\n");
4508 }
4509
4510 /*
4511  * Local variables:
4512  *  indent-tabs-mode: nil
4513  *  c-indent-level: 4
4514  *  c-basic-offset: 4
4515  *  tab-width: 4
4516  * End:
4517  */