win32/regcomp.c

   1 /*
   2   regcomp.c - TRE POSIX compatible regex compilation functions.
   3
   4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   5   All rights reserved.
   6
   7   Redistribution and use in source and binary forms, with or without
   8   modification, are permitted provided that the following conditions
   9   are met:
  10
  11     1. Redistributions of source code must retain the above copyright
  12        notice, this list of conditions and the following disclaimer.
  13
  14     2. Redistributions in binary form must reproduce the above copyright
  15        notice, this list of conditions and the following disclaimer in the
  16        documentation and/or other materials provided with the distribution.
  17
  18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 */
  31
  32 #include <string.h>
  33 #include <stdlib.h>
  34 #include <regex.h>
  35 #include <limits.h>
  36 #include <stdint.h>
  37 #include <ctype.h>
  38
  39 #include "tre.h"
  40
  41 #include <assert.h>
  42
  43 /***********************************************************************
  44  from tre-compile.h
  45 ***********************************************************************/
  46
  47 typedef struct {
  48   int position;
  49   int code_min;
  50   int code_max;
  51   int *tags;
  52   int assertions;
  53   tre_ctype_t class;
  54   tre_ctype_t *neg_classes;
  55   int backref;
  56 } tre_pos_and_tags_t;
  57
  58
  59 /***********************************************************************
  60  from tre-ast.c and tre-ast.h
  61 ***********************************************************************/
  62
  63 /* The different AST node types. */
  64 typedef enum {
  65   LITERAL,
  66   CATENATION,
  67   ITERATION,
  68   UNION
  69 } tre_ast_type_t;
  70
  71 /* Special subtypes of TRE_LITERAL. */
  72 #define EMPTY     -1   /* Empty leaf (denotes empty string). */
  73 #define ASSERTION -2   /* Assertion leaf. */
  74 #define TAG       -3   /* Tag leaf. */
  75 #define BACKREF   -4   /* Back reference leaf. */
  76
  77 #define IS_SPECIAL(x)   ((x)->code_min < 0)
  78 #define IS_EMPTY(x)     ((x)->code_min == EMPTY)
  79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
  80 #define IS_TAG(x)       ((x)->code_min == TAG)
  81 #define IS_BACKREF(x)   ((x)->code_min == BACKREF)
  82
  83
  84 /* A generic AST node.  All AST nodes consist of this node on the top
  85    level with `obj' pointing to the actual content. */
  86 typedef struct {
  87   tre_ast_type_t type;   /* Type of the node. */
  88   void *obj;             /* Pointer to actual node. */
  89   int nullable;
  90   int submatch_id;
  91   int num_submatches;
  92   int num_tags;
  93   tre_pos_and_tags_t *firstpos;
  94   tre_pos_and_tags_t *lastpos;
  95 } tre_ast_node_t;
  96
  97
  98 /* A "literal" node.  These are created for assertions, back references,
  99    tags, matching parameter settings, and all expressions that match one
 100    character. */
 101 typedef struct {
 102   long code_min;
 103   long code_max;
 104   int position;
 105   tre_ctype_t class;
 106   tre_ctype_t *neg_classes;
 107 } tre_literal_t;
 108
 109 /* A "catenation" node.  These are created when two regexps are concatenated.
 110    If there are more than one subexpressions in sequence, the `left' part
 111    holds all but the last, and `right' part holds the last subexpression
 112    (catenation is left associative). */
 113 typedef struct {
 114   tre_ast_node_t *left;
 115   tre_ast_node_t *right;
 116 } tre_catenation_t;
 117
 118 /* An "iteration" node.  These are created for the "*", "+", "?", and "{m,n}"
 119    operators. */
 120 typedef struct {
 121   /* Subexpression to match. */
 122   tre_ast_node_t *arg;
 123   /* Minimum number of consecutive matches. */
 124   int min;
 125   /* Maximum number of consecutive matches. */
 126   int max;
 127   /* If 0, match as many characters as possible, if 1 match as few as
 128      possible.  Note that this does not always mean the same thing as
 129      matching as many/few repetitions as possible. */
 130   unsigned int minimal:1;
 131 } tre_iteration_t;
 132
 133 /* An "union" node.  These are created for the "|" operator. */
 134 typedef struct {
 135   tre_ast_node_t *left;
 136   tre_ast_node_t *right;
 137 } tre_union_t;
 138
 139
 140 static tre_ast_node_t *
 141 tre_ast_new_node(tre_mem_t mem, int type, void *obj)
 142 {
 143         tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
 144         if (!node || !obj)
 145                 return 0;
 146         node->obj = obj;
 147         node->type = type;
 148         node->nullable = -1;
 149         node->submatch_id = -1;
 150         return node;
 151 }
 152
 153 static tre_ast_node_t *
 154 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
 155 {
 156         tre_ast_node_t *node;
 157         tre_literal_t *lit;
 158
 159         lit = tre_mem_calloc(mem, sizeof *lit);
 160         node = tre_ast_new_node(mem, LITERAL, lit);
 161         if (!node)
 162                 return 0;
 163         lit->code_min = code_min;
 164         lit->code_max = code_max;
 165         lit->position = position;
 166         return node;
 167 }
 168
 169 static tre_ast_node_t *
 170 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
 171 {
 172         tre_ast_node_t *node;
 173         tre_iteration_t *iter;
 174
 175         iter = tre_mem_calloc(mem, sizeof *iter);
 176         node = tre_ast_new_node(mem, ITERATION, iter);
 177         if (!node)
 178                 return 0;
 179         iter->arg = arg;
 180         iter->min = min;
 181         iter->max = max;
 182         iter->minimal = minimal;
 183         node->num_submatches = arg->num_submatches;
 184         return node;
 185 }
 186
 187 static tre_ast_node_t *
 188 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 189 {
 190         tre_ast_node_t *node;
 191         tre_union_t *un;
 192
 193         if (!left)
 194                 return right;
 195         un = tre_mem_calloc(mem, sizeof *un);
 196         node = tre_ast_new_node(mem, UNION, un);
 197         if (!node || !right)
 198                 return 0;
 199         un->left = left;
 200         un->right = right;
 201         node->num_submatches = left->num_submatches + right->num_submatches;
 202         return node;
 203 }
 204
 205 static tre_ast_node_t *
 206 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 207 {
 208         tre_ast_node_t *node;
 209         tre_catenation_t *cat;
 210
 211         if (!left)
 212                 return right;
 213         cat = tre_mem_calloc(mem, sizeof *cat);
 214         node = tre_ast_new_node(mem, CATENATION, cat);
 215         if (!node)
 216                 return 0;
 217         cat->left = left;
 218         cat->right = right;
 219         node->num_submatches = left->num_submatches + right->num_submatches;
 220         return node;
 221 }
 222
 223
 224 /***********************************************************************
 225  from tre-stack.c and tre-stack.h
 226 ***********************************************************************/
 227
 228 typedef struct tre_stack_rec tre_stack_t;
 229
 230 /* Creates a new stack object.  `size' is initial size in bytes, `max_size'
 231    is maximum size, and `increment' specifies how much more space will be
 232    allocated with realloc() if all space gets used up.  Returns the stack
 233    object or NULL if out of memory. */
 234 static tre_stack_t *
 235 tre_stack_new(int size, int max_size, int increment);
 236
 237 /* Frees the stack object. */
 238 static void
 239 tre_stack_destroy(tre_stack_t *s);
 240
 241 /* Returns the current number of objects in the stack. */
 242 static int
 243 tre_stack_num_objects(tre_stack_t *s);
 244
 245 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
 246    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
 247    This tries to realloc() more space before failing if maximum size
 248    has not yet been reached.  Returns REG_OK if successful. */
 249 #define declare_pushf(typetag, type)                                          \
 250   static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
 251
 252 declare_pushf(voidptr, void *);
 253 declare_pushf(int, int);
 254
 255 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
 256    element off of stack `s' and returns it.  The stack must not be
 257    empty. */
 258 #define declare_popf(typetag, type)               \
 259   static type tre_stack_pop_ ## typetag(tre_stack_t *s)
 260
 261 declare_popf(voidptr, void *);
 262 declare_popf(int, int);
 263
 264 /* Just to save some typing. */
 265 #define STACK_PUSH(s, typetag, value)                                         \
 266   do                                                                          \
 267     {                                                                         \
 268       status = tre_stack_push_ ## typetag(s, value);                          \
 269     }                                                                         \
 270   while (/*CONSTCOND*/0)
 271
 272 #define STACK_PUSHX(s, typetag, value)                                        \
 273   {                                                                           \
 274     status = tre_stack_push_ ## typetag(s, value);                            \
 275     if (status != REG_OK)                                                     \
 276       break;                                                                  \
 277   }
 278
 279 #define STACK_PUSHR(s, typetag, value)                                        \
 280   {                                                                           \
 281     reg_errcode_t _status;                                                    \
 282     _status = tre_stack_push_ ## typetag(s, value);                           \
 283     if (_status != REG_OK)                                                    \
 284       return _status;                                                         \
 285   }
 286
 287 union tre_stack_item {
 288   void *voidptr_value;
 289   int int_value;
 290 };
 291
 292 struct tre_stack_rec {
 293   int size;
 294   int max_size;
 295   int increment;
 296   int ptr;
 297   union tre_stack_item *stack;
 298 };
 299
 300
 301 static tre_stack_t *
 302 tre_stack_new(int size, int max_size, int increment)
 303 {
 304   tre_stack_t *s;
 305
 306   s = xmalloc(sizeof(*s));
 307   if (s != NULL)
 308     {
 309       s->stack = xmalloc(sizeof(*s->stack) * size);
 310       if (s->stack == NULL)
 311         {
 312           xfree(s);
 313           return NULL;
 314         }
 315       s->size = size;
 316       s->max_size = max_size;
 317       s->increment = increment;
 318       s->ptr = 0;
 319     }
 320   return s;
 321 }
 322
 323 static void
 324 tre_stack_destroy(tre_stack_t *s)
 325 {
 326   xfree(s->stack);
 327   xfree(s);
 328 }
 329
 330 static int
 331 tre_stack_num_objects(tre_stack_t *s)
 332 {
 333   return s->ptr;
 334 }
 335
 336 static reg_errcode_t
 337 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
 338 {
 339   if (s->ptr < s->size)
 340     {
 341       s->stack[s->ptr] = value;
 342       s->ptr++;
 343     }
 344   else
 345     {
 346       if (s->size >= s->max_size)
 347         {
 348           return REG_ESPACE;
 349         }
 350       else
 351         {
 352           union tre_stack_item *new_buffer;
 353           int new_size;
 354           new_size = s->size + s->increment;
 355           if (new_size > s->max_size)
 356             new_size = s->max_size;
 357           new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
 358           if (new_buffer == NULL)
 359             {
 360               return REG_ESPACE;
 361             }
 362           assert(new_size > s->size);
 363           s->size = new_size;
 364           s->stack = new_buffer;
 365           tre_stack_push(s, value);
 366         }
 367     }
 368   return REG_OK;
 369 }
 370
 371 #define define_pushf(typetag, type)  \
 372   declare_pushf(typetag, type) {     \
 373     union tre_stack_item item;       \
 374     item.typetag ## _value = value;  \
 375     return tre_stack_push(s, item);  \
 376 }
 377
 378 define_pushf(int, int)
 379 define_pushf(voidptr, void *)
 380
 381 #define define_popf(typetag, type)                  \
 382   declare_popf(typetag, type) {                     \
 383     return s->stack[--s->ptr].typetag ## _value;    \
 384   }
 385
 386 define_popf(int, int)
 387 define_popf(voidptr, void *)
 388
 389
 390 /***********************************************************************
 391  from tre-parse.c and tre-parse.h
 392 ***********************************************************************/
 393
 394 /* Parse context. */
 395 typedef struct {
 396         /* Memory allocator. The AST is allocated using this. */
 397         tre_mem_t mem;
 398         /* Stack used for keeping track of regexp syntax. */
 399         tre_stack_t *stack;
 400         /* The parsed node after a parse function returns. */
 401         tre_ast_node_t *n;
 402         /* Position in the regexp pattern after a parse function returns. */
 403         const char *s;
 404         /* The first character of the last subexpression parsed. */
 405         const char *start;
 406         /* Current submatch ID. */
 407         int submatch_id;
 408         /* Current position (number of literal). */
 409         int position;
 410         /* The highest back reference or -1 if none seen so far. */
 411         int max_backref;
 412         /* Compilation flags. */
 413         int cflags;
 414 } tre_parse_ctx_t;
 415
 416 /* Some macros for expanding \w, \s, etc. */
 417 static const struct {
 418         char c;
 419         const char *expansion;
 420 } tre_macros[] = {
 421         {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
 422         {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
 423         {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
 424         {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
 425         { 0, 0 }
 426 };
 427
 428 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
 429    must have at least `len' items.  Sets buf[0] to zero if the there
 430    is no match in `tre_macros'. */
 431 static const char *tre_expand_macro(const char *s)
 432 {
 433         int i;
 434         for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
 435         return tre_macros[i].expansion;
 436 }
 437
 438 static int
 439 tre_compare_lit(const void *a, const void *b)
 440 {
 441         const tre_literal_t *const *la = a;
 442         const tre_literal_t *const *lb = b;
 443         /* assumes the range of valid code_min is < INT_MAX */
 444         return la[0]->code_min - lb[0]->code_min;
 445 }
 446
 447 struct literals {
 448         tre_mem_t mem;
 449         tre_literal_t **a;
 450         int len;
 451         int cap;
 452 };
 453
 454 static tre_literal_t *tre_new_lit(struct literals *p)
 455 {
 456         tre_literal_t **a;
 457         if (p->len >= p->cap) {
 458                 if (p->cap >= 1<<15)
 459                         return 0;
 460                 p->cap *= 2;
 461                 a = xrealloc(p->a, p->cap * sizeof *p->a);
 462                 if (!a)
 463                         return 0;
 464                 p->a = a;
 465         }
 466         a = p->a + p->len++;
 467         *a = tre_mem_calloc(p->mem, sizeof **a);
 468         return *a;
 469 }
 470
 471 static int add_icase_literals(struct literals *ls, int min, int max)
 472 {
 473         tre_literal_t *lit;
 474         int b, e, c;
 475         for (c=min; c<=max; ) {
 476                 /* assumes islower(c) and isupper(c) are exclusive
 477                    and toupper(c)!=c if islower(c).
 478                    multiple opposite case characters are not supported */
 479                 if (tre_islower(c)) {
 480                         b = e = tre_toupper(c);
 481                         for (c++, e++; c<=max; c++, e++)
 482                                 if (tre_toupper(c) != e) break;
 483                 } else if (tre_isupper(c)) {
 484                         b = e = tre_tolower(c);
 485                         for (c++, e++; c<=max; c++, e++)
 486                                 if (tre_tolower(c) != e) break;
 487                 } else {
 488                         c++;
 489                         continue;
 490                 }
 491                 lit = tre_new_lit(ls);
 492                 if (!lit)
 493                         return -1;
 494                 lit->code_min = b;
 495                 lit->code_max = e-1;
 496                 lit->position = -1;
 497         }
 498         return 0;
 499 }
 500
 501
 502 /* Maximum number of character classes in a negated bracket expression. */
 503 #define MAX_NEG_CLASSES 64
 504
 505 struct neg {
 506         int negate;
 507         int len;
 508         tre_ctype_t a[MAX_NEG_CLASSES];
 509 };
 510
 511 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
 512
 513 /*
 514 bracket grammar:
 515 Bracket  =  '[' List ']'  |  '[^' List ']'
 516 List     =  Term  |  List Term
 517 Term     =  Char  |  Range  |  Chclass  |  Eqclass
 518 Range    =  Char '-' Char  |  Char '-' '-'
 519 Char     =  Coll  |  coll_single
 520 Meta     =  ']'  |  '-'
 521 Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
 522 Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
 523 Chclass  =  '[:' class ':]'
 524
 525 coll_single is a single char collating element but it can be
 526  '-' only at the beginning or end of a List and
 527  ']' only at the beginning of a List and
 528  '^' anywhere except after the openning '['
 529 */
 530
 531 static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
 532 {
 533         const char *start = s;
 534         tre_ctype_t class;
 535         int min, max;
 536         wchar_t wc;
 537         int len;
 538
 539         for (;;) {
 540                 class = 0;
 541                 len = mbtowc(&wc, s, -1);
 542                 if (len <= 0)
 543                         return *s ? REG_BADPAT : REG_EBRACK;
 544                 if (*s == ']' && s != start) {
 545                         ctx->s = s+1;
 546                         return REG_OK;
 547                 }
 548                 if (*s == '-' && s != start && s[1] != ']' &&
 549                     /* extension: [a-z--@] is accepted as [a-z]|[--@] */
 550                     (s[1] != '-' || s[2] == ']'))
 551                         return REG_ERANGE;
 552                 if (*s == '[' && (s[1] == '.' || s[1] == '='))
 553                         /* collating symbols and equivalence classes are not supported */
 554                         return REG_ECOLLATE;
 555                 if (*s == '[' && s[1] == ':') {
 556                         char tmp[CHARCLASS_NAME_MAX+1];
 557                         s += 2;
 558                         for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
 559                                 if (s[len] == ':') {
 560                                         memcpy(tmp, s, len);
 561                                         tmp[len] = 0;
 562                                         class = tre_ctype(tmp);
 563                                         break;
 564                                 }
 565                         }
 566                         if (!class || s[len+1] != ']')
 567                                 return REG_ECTYPE;
 568                         min = 0;
 569                         max = TRE_CHAR_MAX;
 570                         s += len+2;
 571                 } else {
 572                         min = max = wc;
 573                         s += len;
 574                         if (*s == '-' && s[1] != ']') {
 575                                 s++;
 576                                 len = mbtowc(&wc, s, -1);
 577                                 max = wc;
 578                                 /* XXX - Should use collation order instead of
 579                                    encoding values in character ranges. */
 580                                 if (len <= 0 || min > max)
 581                                         return REG_ERANGE;
 582                                 s += len;
 583                         }
 584                 }
 585
 586                 if (class && neg->negate) {
 587                         if (neg->len >= MAX_NEG_CLASSES)
 588                                 return REG_ESPACE;
 589                         neg->a[neg->len++] = class;
 590                 } else  {
 591                         tre_literal_t *lit = tre_new_lit(ls);
 592                         if (!lit)
 593                                 return REG_ESPACE;
 594                         lit->code_min = min;
 595                         lit->code_max = max;
 596                         lit->class = class;
 597                         lit->position = -1;
 598
 599                         /* Add opposite-case codepoints if REG_ICASE is present.
 600                            It seems that POSIX requires that bracket negation
 601                            should happen before case-folding, but most practical
 602                            implementations do it the other way around. Changing
 603                            the order would need efficient representation of
 604                            case-fold ranges and bracket range sets even with
 605                            simple patterns so this is ok for now. */
 606                         if (ctx->cflags & REG_ICASE && !class)
 607                                 if (add_icase_literals(ls, min, max))
 608                                         return REG_ESPACE;
 609                 }
 610         }
 611 }
 612
 613 static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
 614 {
 615         int i, max, min, negmax, negmin;
 616         tre_ast_node_t *node = 0, *n;
 617         tre_ctype_t *nc = 0;
 618         tre_literal_t *lit;
 619         struct literals ls;
 620         struct neg neg;
 621         reg_errcode_t err;
 622
 623         ls.mem = ctx->mem;
 624         ls.len = 0;
 625         ls.cap = 32;
 626         ls.a = xmalloc(ls.cap * sizeof *ls.a);
 627         if (!ls.a)
 628                 return REG_ESPACE;
 629         neg.len = 0;
 630         neg.negate = *s == '^';
 631         if (neg.negate)
 632                 s++;
 633
 634         err = parse_bracket_terms(ctx, s, &ls, &neg);
 635         if (err != REG_OK)
 636                 goto parse_bracket_done;
 637
 638         if (neg.negate) {
 639                 /*
 640                  * With REG_NEWLINE, POSIX requires that newlines are not matched by
 641                  * any form of a non-matching list.
 642                  */
 643                 if (ctx->cflags & REG_NEWLINE) {
 644                         lit = tre_new_lit(&ls);
 645                         if (!lit) {
 646                                 err = REG_ESPACE;
 647                                 goto parse_bracket_done;
 648                         }
 649                         lit->code_min = '\n';
 650                         lit->code_max = '\n';
 651                         lit->position = -1;
 652                 }
 653                 /* Sort the array if we need to negate it. */
 654                 qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
 655                 /* extra lit for the last negated range */
 656                 lit = tre_new_lit(&ls);
 657                 if (!lit) {
 658                         err = REG_ESPACE;
 659                         goto parse_bracket_done;
 660                 }
 661                 lit->code_min = TRE_CHAR_MAX+1;
 662                 lit->code_max = TRE_CHAR_MAX+1;
 663                 lit->position = -1;
 664                 /* negated classes */
 665                 if (neg.len) {
 666                         nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
 667                         if (!nc) {
 668                                 err = REG_ESPACE;
 669                                 goto parse_bracket_done;
 670                         }
 671                         memcpy(nc, neg.a, neg.len*sizeof *neg.a);
 672                         nc[neg.len] = 0;
 673                 }
 674         }
 675
 676         /* Build a union of the items in the array, negated if necessary. */
 677         negmax = negmin = 0;
 678         for (i = 0; i < ls.len; i++) {
 679                 lit = ls.a[i];
 680                 min = lit->code_min;
 681                 max = lit->code_max;
 682                 if (neg.negate) {
 683                         if (min <= negmin) {
 684                                 /* Overlap. */
 685                                 negmin = MAX(max + 1, negmin);
 686                                 continue;
 687                         }
 688                         negmax = min - 1;
 689                         lit->code_min = negmin;
 690                         lit->code_max = negmax;
 691                         negmin = max + 1;
 692                 }
 693                 lit->position = ctx->position;
 694                 lit->neg_classes = nc;
 695                 n = tre_ast_new_node(ctx->mem, LITERAL, lit);
 696                 node = tre_ast_new_union(ctx->mem, node, n);
 697                 if (!node) {
 698                         err = REG_ESPACE;
 699                         break;
 700                 }
 701         }
 702
 703 parse_bracket_done:
 704         xfree(ls.a);
 705         ctx->position++;
 706         ctx->n = node;
 707         return err;
 708 }
 709
 710 static const char *parse_dup_count(const char *s, int *n)
 711 {
 712         *n = -1;
 713         if (!isdigit(*s))
 714                 return s;
 715         *n = 0;
 716         for (;;) {
 717                 *n = 10 * *n + (*s - '0');
 718                 s++;
 719                 if (!isdigit(*s) || *n > RE_DUP_MAX)
 720                         break;
 721         }
 722         return s;
 723 }
 724
 725 static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
 726 {
 727         int min, max;
 728
 729         s = parse_dup_count(s, &min);
 730         if (*s == ',')
 731                 s = parse_dup_count(s+1, &max);
 732         else
 733                 max = min;
 734
 735         if (
 736                 (max < min && max >= 0) ||
 737                 max > RE_DUP_MAX ||
 738                 min > RE_DUP_MAX ||
 739                 min < 0 ||
 740                 (!ere && *s++ != '\\') ||
 741                 *s++ != '}'
 742         )
 743                 return 0;
 744         *pmin = min;
 745         *pmax = max;
 746         return s;
 747 }
 748
 749 static int hexval(unsigned c)
 750 {
 751         if (c-'0'<10) return c-'0';
 752         c |= 32;
 753         if (c-'a'<6) return c-'a'+10;
 754         return -1;
 755 }
 756
 757 static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
 758 {
 759         if (node->submatch_id >= 0) {
 760                 tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 761                 if (!n)
 762                         return REG_ESPACE;
 763                 n = tre_ast_new_catenation(ctx->mem, n, node);
 764                 if (!n)
 765                         return REG_ESPACE;
 766                 n->num_submatches = node->num_submatches;
 767                 node = n;
 768         }
 769         node->submatch_id = subid;
 770         node->num_submatches++;
 771         ctx->n = node;
 772         return REG_OK;
 773 }
 774
 775 /*
 776 BRE grammar:
 777 Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
 778 Branch =  Atom  |  Branch Atom
 779 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
 780 Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
 781
 782 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
 783
 784 ERE grammar:
 785 Regex  =  Branch  |  Regex '|' Branch
 786 Branch =  Atom  |  Branch Atom
 787 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
 788 Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
 789
 790 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
 791 */
 792
 793 static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
 794 {
 795         int len, ere = ctx->cflags & REG_EXTENDED;
 796         const char *p;
 797         tre_ast_node_t *node;
 798         wchar_t wc;
 799         switch (*s) {
 800         case '[':
 801                 return parse_bracket(ctx, s+1);
 802         case '\\':
 803                 p = tre_expand_macro(s+1);
 804                 if (p) {
 805                         /* assume \X expansion is a single atom */
 806                         reg_errcode_t err = parse_atom(ctx, p);
 807                         ctx->s = s+2;
 808                         return err;
 809                 }
 810                 /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
 811                 switch (*++s) {
 812                 case 0:
 813                         return REG_EESCAPE;
 814                 case 'b':
 815                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
 816                         break;
 817                 case 'B':
 818                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
 819                         break;
 820                 case '<':
 821                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
 822                         break;
 823                 case '>':
 824                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
 825                         break;
 826                 case 'x':
 827                         s++;
 828                         int i, v = 0, c;
 829                         len = 2;
 830                         if (*s == '{') {
 831                                 len = 8;
 832                                 s++;
 833                         }
 834                         for (i=0; i<len && v<0x110000; i++) {
 835                                 c = hexval(s[i]);
 836                                 if (c < 0) break;
 837                                 v = 16*v + c;
 838                         }
 839                         s += i;
 840                         if (len == 8) {
 841                                 if (*s != '}')
 842                                         return REG_EBRACE;
 843                                 s++;
 844                         }
 845                         node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
 846                         s--;
 847                         break;
 848                 case '{':
 849                 case '+':
 850                 case '?':
 851                         /* extension: treat \+, \? as repetitions in BRE */
 852                         /* reject repetitions after empty expression in BRE */
 853                         if (!ere)
 854                                 return REG_BADRPT;
 855                 case '|':
 856                         /* extension: treat \| as alternation in BRE */
 857                         if (!ere) {
 858                                 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 859                                 s--;
 860                                 goto end;
 861                         }
 862                         /* fallthrough */
 863                 default:
 864                         if (!ere && (unsigned)*s-'1' < 9) {
 865                                 /* back reference */
 866                                 int val = *s - '0';
 867                                 node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
 868                                 ctx->max_backref = MAX(val, ctx->max_backref);
 869                         } else {
 870                                 /* extension: accept unknown escaped char
 871                                    as a literal */
 872                                 goto parse_literal;
 873                         }
 874                 }
 875                 s++;
 876                 break;
 877         case '.':
 878                 if (ctx->cflags & REG_NEWLINE) {
 879                         tre_ast_node_t *tmp1, *tmp2;
 880                         tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
 881                         tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
 882                         if (tmp1 && tmp2)
 883                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 884                         else
 885                                 node = 0;
 886                 } else {
 887                         node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
 888                 }
 889                 s++;
 890                 break;
 891         case '^':
 892                 /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
 893                 if (!ere && s != ctx->start)
 894                         goto parse_literal;
 895                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
 896                 s++;
 897                 break;
 898         case '$':
 899                 /* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
 900                 if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|')))
 901                         goto parse_literal;
 902                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
 903                 s++;
 904                 break;
 905         case '*':
 906         case '{':
 907         case '+':
 908         case '?':
 909                 /* reject repetitions after empty expression in ERE */
 910                 if (ere)
 911                         return REG_BADRPT;
 912         case '|':
 913                 if (!ere)
 914                         goto parse_literal;
 915         case 0:
 916                 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 917                 break;
 918         default:
 919 parse_literal:
 920                 len = mbtowc(&wc, s, -1);
 921                 if (len < 0)
 922                         return REG_BADPAT;
 923                 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
 924                         tre_ast_node_t *tmp1, *tmp2;
 925                         /* multiple opposite case characters are not supported */
 926                         tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
 927                         tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
 928                         if (tmp1 && tmp2)
 929                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 930                         else
 931                                 node = 0;
 932                 } else {
 933                         node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
 934                 }
 935                 ctx->position++;
 936                 s += len;
 937                 break;
 938         }
 939 end:
 940         if (!node)
 941                 return REG_ESPACE;
 942         ctx->n = node;
 943         ctx->s = s;
 944         return REG_OK;
 945 }
 946
 947 #define PUSHPTR(err, s, v) do { \
 948         if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
 949                 return err; \
 950 } while(0)
 951
 952 #define PUSHINT(err, s, v) do { \
 953         if ((err = tre_stack_push_int(s, v)) != REG_OK) \
 954                 return err; \
 955 } while(0)
 956
 957 static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
 958 {
 959         tre_ast_node_t *nbranch=0, *nunion=0;
 960         int ere = ctx->cflags & REG_EXTENDED;
 961         const char *s = ctx->start;
 962         int subid = 0;
 963         int depth = 0;
 964         reg_errcode_t err;
 965         tre_stack_t *stack = ctx->stack;
 966
 967         PUSHINT(err, stack, subid++);
 968         for (;;) {
 969                 if ((!ere && *s == '\\' && s[1] == '(') ||
 970                     (ere && *s == '(')) {
 971                         PUSHPTR(err, stack, nunion);
 972                         PUSHPTR(err, stack, nbranch);
 973                         PUSHINT(err, stack, subid++);
 974                         s++;
 975                         if (!ere)
 976                                 s++;
 977                         depth++;
 978                         nbranch = nunion = 0;
 979                         ctx->start = s;
 980                         continue;
 981                 }
 982                 if ((!ere && *s == '\\' && s[1] == ')') ||
 983                     (ere && *s == ')' && depth)) {
 984                         ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 985                         if (!ctx->n)
 986                                 return REG_ESPACE;
 987                 } else {
 988                         err = parse_atom(ctx, s);
 989                         if (err != REG_OK)
 990                                 return err;
 991                         s = ctx->s;
 992                 }
 993
 994         parse_iter:
 995                 for (;;) {
 996                         int min, max;
 997
 998                         if (*s!='\\' && *s!='*') {
 999                                 if (!ere)
1000                                         break;
1001                                 if (*s!='+' && *s!='?' && *s!='{')
1002                                         break;
1003                         }
1004                         if (*s=='\\' && ere)
1005                                 break;
1006                         /* extension: treat \+, \? as repetitions in BRE */
1007                         if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
1008                                 break;
1009                         if (*s=='\\')
1010                                 s++;
1011
1012                         /* handle ^* at the start of a BRE. */
1013                         if (!ere && s==ctx->start+1 && s[-1]=='^')
1014                                 break;
1015
1016                         /* extension: multiple consecutive *+?{,} is unspecified,
1017                            but (a+)+ has to be supported so accepting a++ makes
1018                            sense, note however that the RE_DUP_MAX limit can be
1019                            circumvented: (a{255}){255} uses a lot of memory.. */
1020                         if (*s=='{') {
1021                                 s = parse_dup(s+1, ere, &min, &max);
1022                                 if (!s)
1023                                         return REG_BADBR;
1024                         } else {
1025                                 min=0;
1026                                 max=-1;
1027                                 if (*s == '+')
1028                                         min = 1;
1029                                 if (*s == '?')
1030                                         max = 1;
1031                                 s++;
1032                         }
1033                         if (max == 0)
1034                                 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1035                         else
1036                                 ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
1037                         if (!ctx->n)
1038                                 return REG_ESPACE;
1039                 }
1040
1041                 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1042                 if ((ere && *s == '|') ||
1043                     (ere && *s == ')' && depth) ||
1044                     (!ere && *s == '\\' && s[1] == ')') ||
1045                     /* extension: treat \| as alternation in BRE */
1046                     (!ere && *s == '\\' && s[1] == '|') ||
1047                     !*s) {
1048                         /* extension: empty branch is unspecified (), (|a), (a|)
1049                            here they are not rejected but match on empty string */
1050                         int c = *s;
1051                         nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1052                         nbranch = 0;
1053
1054                         if (c == '\\' && s[1] == '|') {
1055                                 s+=2;
1056                                 ctx->start = s;
1057                         } else if (c == '|') {
1058                                 s++;
1059                                 ctx->start = s;
1060                         } else {
1061                                 if (c == '\\') {
1062                                         if (!depth) return REG_EPAREN;
1063                                         s+=2;
1064                                 } else if (c == ')')
1065                                         s++;
1066                                 depth--;
1067                                 err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1068                                 if (err != REG_OK)
1069                                         return err;
1070                                 if (!c && depth<0) {
1071                                         ctx->submatch_id = subid;
1072                                         return REG_OK;
1073                                 }
1074                                 if (!c || depth<0)
1075                                         return REG_EPAREN;
1076                                 nbranch = tre_stack_pop_voidptr(stack);
1077                                 nunion = tre_stack_pop_voidptr(stack);
1078                                 goto parse_iter;
1079                         }
1080                 }
1081         }
1082 }
1083
1084
1085 /***********************************************************************
1086  from tre-compile.c
1087 ***********************************************************************/
1088
1089
1090 /*
1091   TODO:
1092    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1093      function calls.
1094 */
1095
1096 /*
1097   Algorithms to setup tags so that submatch addressing can be done.
1098 */
1099
1100
1101 /* Inserts a catenation node to the root of the tree given in `node'.
1102    As the left child a new tag with number `tag_id' to `node' is added,
1103    and the right child is the old root. */
1104 static reg_errcode_t
1105 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1106 {
1107   tre_catenation_t *c;
1108
1109   c = tre_mem_alloc(mem, sizeof(*c));
1110   if (c == NULL)
1111     return REG_ESPACE;
1112   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1113   if (c->left == NULL)
1114     return REG_ESPACE;
1115   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1116   if (c->right == NULL)
1117     return REG_ESPACE;
1118
1119   c->right->obj = node->obj;
1120   c->right->type = node->type;
1121   c->right->nullable = -1;
1122   c->right->submatch_id = -1;
1123   c->right->firstpos = NULL;
1124   c->right->lastpos = NULL;
1125   c->right->num_tags = 0;
1126   c->right->num_submatches = 0;
1127   node->obj = c;
1128   node->type = CATENATION;
1129   return REG_OK;
1130 }
1131
1132 /* Inserts a catenation node to the root of the tree given in `node'.
1133    As the right child a new tag with number `tag_id' to `node' is added,
1134    and the left child is the old root. */
1135 static reg_errcode_t
1136 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1137 {
1138   tre_catenation_t *c;
1139
1140   c = tre_mem_alloc(mem, sizeof(*c));
1141   if (c == NULL)
1142     return REG_ESPACE;
1143   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1144   if (c->right == NULL)
1145     return REG_ESPACE;
1146   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1147   if (c->left == NULL)
1148     return REG_ESPACE;
1149
1150   c->left->obj = node->obj;
1151   c->left->type = node->type;
1152   c->left->nullable = -1;
1153   c->left->submatch_id = -1;
1154   c->left->firstpos = NULL;
1155   c->left->lastpos = NULL;
1156   c->left->num_tags = 0;
1157   c->left->num_submatches = 0;
1158   node->obj = c;
1159   node->type = CATENATION;
1160   return REG_OK;
1161 }
1162
1163 typedef enum {
1164   ADDTAGS_RECURSE,
1165   ADDTAGS_AFTER_ITERATION,
1166   ADDTAGS_AFTER_UNION_LEFT,
1167   ADDTAGS_AFTER_UNION_RIGHT,
1168   ADDTAGS_AFTER_CAT_LEFT,
1169   ADDTAGS_AFTER_CAT_RIGHT,
1170   ADDTAGS_SET_SUBMATCH_END
1171 } tre_addtags_symbol_t;
1172
1173
1174 typedef struct {
1175   int tag;
1176   int next_tag;
1177 } tre_tag_states_t;
1178
1179
1180 /* Go through `regset' and set submatch data for submatches that are
1181    using this tag. */
1182 static void
1183 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1184 {
1185   int i;
1186
1187   for (i = 0; regset[i] >= 0; i++)
1188     {
1189       int id = regset[i] / 2;
1190       int start = !(regset[i] % 2);
1191       if (start)
1192         tnfa->submatch_data[id].so_tag = tag;
1193       else
1194         tnfa->submatch_data[id].eo_tag = tag;
1195     }
1196   regset[0] = -1;
1197 }
1198
1199
1200 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1201    subexpressions marked for submatch addressing can be traced. */
1202 static reg_errcode_t
1203 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1204              tre_tnfa_t *tnfa)
1205 {
1206   reg_errcode_t status = REG_OK;
1207   tre_addtags_symbol_t symbol;
1208   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1209   int bottom = tre_stack_num_objects(stack);
1210   /* True for first pass (counting number of needed tags) */
1211   int first_pass = (mem == NULL || tnfa == NULL);
1212   int *regset, *orig_regset;
1213   int num_tags = 0; /* Total number of tags. */
1214   int num_minimals = 0;  /* Number of special minimal tags. */
1215   int tag = 0;      /* The tag that is to be added next. */
1216   int next_tag = 1; /* Next tag to use after this one. */
1217   int *parents;     /* Stack of submatches the current submatch is
1218                        contained in. */
1219   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1220   tre_tag_states_t *saved_states;
1221
1222   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1223   if (!first_pass)
1224     {
1225       tnfa->end_tag = 0;
1226       tnfa->minimal_tags[0] = -1;
1227     }
1228
1229   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1230   if (regset == NULL)
1231     return REG_ESPACE;
1232   regset[0] = -1;
1233   orig_regset = regset;
1234
1235   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1236   if (parents == NULL)
1237     {
1238       xfree(regset);
1239       return REG_ESPACE;
1240     }
1241   parents[0] = -1;
1242
1243   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1244   if (saved_states == NULL)
1245     {
1246       xfree(regset);
1247       xfree(parents);
1248       return REG_ESPACE;
1249     }
1250   else
1251     {
1252       unsigned int i;
1253       for (i = 0; i <= tnfa->num_submatches; i++)
1254         saved_states[i].tag = -1;
1255     }
1256
1257   STACK_PUSH(stack, voidptr, node);
1258   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1259
1260   while (tre_stack_num_objects(stack) > bottom)
1261     {
1262       if (status != REG_OK)
1263         break;
1264
1265       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1266       switch (symbol)
1267         {
1268
1269         case ADDTAGS_SET_SUBMATCH_END:
1270           {
1271             int id = tre_stack_pop_int(stack);
1272             int i;
1273
1274             /* Add end of this submatch to regset. */
1275             for (i = 0; regset[i] >= 0; i++);
1276             regset[i] = id * 2 + 1;
1277             regset[i + 1] = -1;
1278
1279             /* Pop this submatch from the parents stack. */
1280             for (i = 0; parents[i] >= 0; i++);
1281             parents[i - 1] = -1;
1282             break;
1283           }
1284
1285         case ADDTAGS_RECURSE:
1286           node = tre_stack_pop_voidptr(stack);
1287
1288           if (node->submatch_id >= 0)
1289             {
1290               int id = node->submatch_id;
1291               int i;
1292
1293
1294               /* Add start of this submatch to regset. */
1295               for (i = 0; regset[i] >= 0; i++);
1296               regset[i] = id * 2;
1297               regset[i + 1] = -1;
1298
1299               if (!first_pass)
1300                 {
1301                   for (i = 0; parents[i] >= 0; i++);
1302                   tnfa->submatch_data[id].parents = NULL;
1303                   if (i > 0)
1304                     {
1305                       int *p = xmalloc(sizeof(*p) * (i + 1));
1306                       if (p == NULL)
1307                         {
1308                           status = REG_ESPACE;
1309                           break;
1310                         }
1311                       assert(tnfa->submatch_data[id].parents == NULL);
1312                       tnfa->submatch_data[id].parents = p;
1313                       for (i = 0; parents[i] >= 0; i++)
1314                         p[i] = parents[i];
1315                       p[i] = -1;
1316                     }
1317                 }
1318
1319               /* Add end of this submatch to regset after processing this
1320                  node. */
1321               STACK_PUSHX(stack, int, node->submatch_id);
1322               STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1323             }
1324
1325           switch (node->type)
1326             {
1327             case LITERAL:
1328               {
1329                 tre_literal_t *lit = node->obj;
1330
1331                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1332                   {
1333                     int i;
1334                     if (regset[0] >= 0)
1335                       {
1336                         /* Regset is not empty, so add a tag before the
1337                            literal or backref. */
1338                         if (!first_pass)
1339                           {
1340                             status = tre_add_tag_left(mem, node, tag);
1341                             tnfa->tag_directions[tag] = direction;
1342                             if (minimal_tag >= 0)
1343                               {
1344                                 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1345                                 tnfa->minimal_tags[i] = tag;
1346                                 tnfa->minimal_tags[i + 1] = minimal_tag;
1347                                 tnfa->minimal_tags[i + 2] = -1;
1348                                 minimal_tag = -1;
1349                                 num_minimals++;
1350                               }
1351                             tre_purge_regset(regset, tnfa, tag);
1352                           }
1353                         else
1354                           {
1355                             node->num_tags = 1;
1356                           }
1357
1358                         regset[0] = -1;
1359                         tag = next_tag;
1360                         num_tags++;
1361                         next_tag++;
1362                       }
1363                   }
1364                 else
1365                   {
1366                     assert(!IS_TAG(lit));
1367                   }
1368                 break;
1369               }
1370             case CATENATION:
1371               {
1372                 tre_catenation_t *cat = node->obj;
1373                 tre_ast_node_t *left = cat->left;
1374                 tre_ast_node_t *right = cat->right;
1375                 int reserved_tag = -1;
1376
1377
1378                 /* After processing right child. */
1379                 STACK_PUSHX(stack, voidptr, node);
1380                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1381
1382                 /* Process right child. */
1383                 STACK_PUSHX(stack, voidptr, right);
1384                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1385
1386                 /* After processing left child. */
1387                 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1388                 if (left->num_tags > 0 && right->num_tags > 0)
1389                   {
1390                     /* Reserve the next tag to the right child. */
1391                     reserved_tag = next_tag;
1392                     next_tag++;
1393                   }
1394                 STACK_PUSHX(stack, int, reserved_tag);
1395                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1396
1397                 /* Process left child. */
1398                 STACK_PUSHX(stack, voidptr, left);
1399                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1400
1401                 }
1402               break;
1403             case ITERATION:
1404               {
1405                 tre_iteration_t *iter = node->obj;
1406
1407                 if (first_pass)
1408                   {
1409                     STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1410                   }
1411                 else
1412                   {
1413                     STACK_PUSHX(stack, int, tag);
1414                     STACK_PUSHX(stack, int, iter->minimal);
1415                   }
1416                 STACK_PUSHX(stack, voidptr, node);
1417                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1418
1419                 STACK_PUSHX(stack, voidptr, iter->arg);
1420                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1421
1422                 /* Regset is not empty, so add a tag here. */
1423                 if (regset[0] >= 0 || iter->minimal)
1424                   {
1425                     if (!first_pass)
1426                       {
1427                         int i;
1428                         status = tre_add_tag_left(mem, node, tag);
1429                         if (iter->minimal)
1430                           tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1431                         else
1432                           tnfa->tag_directions[tag] = direction;
1433                         if (minimal_tag >= 0)
1434                           {
1435                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1436                             tnfa->minimal_tags[i] = tag;
1437                             tnfa->minimal_tags[i + 1] = minimal_tag;
1438                             tnfa->minimal_tags[i + 2] = -1;
1439                             minimal_tag = -1;
1440                             num_minimals++;
1441                           }
1442                         tre_purge_regset(regset, tnfa, tag);
1443                       }
1444
1445                     regset[0] = -1;
1446                     tag = next_tag;
1447                     num_tags++;
1448                     next_tag++;
1449                   }
1450                 direction = TRE_TAG_MINIMIZE;
1451               }
1452               break;
1453             case UNION:
1454               {
1455                 tre_union_t *uni = node->obj;
1456                 tre_ast_node_t *left = uni->left;
1457                 tre_ast_node_t *right = uni->right;
1458                 int left_tag;
1459                 int right_tag;
1460
1461                 if (regset[0] >= 0)
1462                   {
1463                     left_tag = next_tag;
1464                     right_tag = next_tag + 1;
1465                   }
1466                 else
1467                   {
1468                     left_tag = tag;
1469                     right_tag = next_tag;
1470                   }
1471
1472                 /* After processing right child. */
1473                 STACK_PUSHX(stack, int, right_tag);
1474                 STACK_PUSHX(stack, int, left_tag);
1475                 STACK_PUSHX(stack, voidptr, regset);
1476                 STACK_PUSHX(stack, int, regset[0] >= 0);
1477                 STACK_PUSHX(stack, voidptr, node);
1478                 STACK_PUSHX(stack, voidptr, right);
1479                 STACK_PUSHX(stack, voidptr, left);
1480                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1481
1482                 /* Process right child. */
1483                 STACK_PUSHX(stack, voidptr, right);
1484                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1485
1486                 /* After processing left child. */
1487                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1488
1489                 /* Process left child. */
1490                 STACK_PUSHX(stack, voidptr, left);
1491                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1492
1493                 /* Regset is not empty, so add a tag here. */
1494                 if (regset[0] >= 0)
1495                   {
1496                     if (!first_pass)
1497                       {
1498                         int i;
1499                         status = tre_add_tag_left(mem, node, tag);
1500                         tnfa->tag_directions[tag] = direction;
1501                         if (minimal_tag >= 0)
1502                           {
1503                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1504                             tnfa->minimal_tags[i] = tag;
1505                             tnfa->minimal_tags[i + 1] = minimal_tag;
1506                             tnfa->minimal_tags[i + 2] = -1;
1507                             minimal_tag = -1;
1508                             num_minimals++;
1509                           }
1510                         tre_purge_regset(regset, tnfa, tag);
1511                       }
1512
1513                     regset[0] = -1;
1514                     tag = next_tag;
1515                     num_tags++;
1516                     next_tag++;
1517                   }
1518
1519                 if (node->num_submatches > 0)
1520                   {
1521                     /* The next two tags are reserved for markers. */
1522                     next_tag++;
1523                     tag = next_tag;
1524                     next_tag++;
1525                   }
1526
1527                 break;
1528               }
1529             }
1530
1531           if (node->submatch_id >= 0)
1532             {
1533               int i;
1534               /* Push this submatch on the parents stack. */
1535               for (i = 0; parents[i] >= 0; i++);
1536               parents[i] = node->submatch_id;
1537               parents[i + 1] = -1;
1538             }
1539
1540           break; /* end case: ADDTAGS_RECURSE */
1541
1542         case ADDTAGS_AFTER_ITERATION:
1543           {
1544             int minimal = 0;
1545             int enter_tag;
1546             node = tre_stack_pop_voidptr(stack);
1547             if (first_pass)
1548               {
1549                 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1550                   + tre_stack_pop_int(stack);
1551                 minimal_tag = -1;
1552               }
1553             else
1554               {
1555                 minimal = tre_stack_pop_int(stack);
1556                 enter_tag = tre_stack_pop_int(stack);
1557                 if (minimal)
1558                   minimal_tag = enter_tag;
1559               }
1560
1561             if (!first_pass)
1562               {
1563                 if (minimal)
1564                   direction = TRE_TAG_MINIMIZE;
1565                 else
1566                   direction = TRE_TAG_MAXIMIZE;
1567               }
1568             break;
1569           }
1570
1571         case ADDTAGS_AFTER_CAT_LEFT:
1572           {
1573             int new_tag = tre_stack_pop_int(stack);
1574             next_tag = tre_stack_pop_int(stack);
1575             if (new_tag >= 0)
1576               {
1577                 tag = new_tag;
1578               }
1579             break;
1580           }
1581
1582         case ADDTAGS_AFTER_CAT_RIGHT:
1583           node = tre_stack_pop_voidptr(stack);
1584           if (first_pass)
1585             node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1586               + ((tre_catenation_t *)node->obj)->right->num_tags;
1587           break;
1588
1589         case ADDTAGS_AFTER_UNION_LEFT:
1590           /* Lift the bottom of the `regset' array so that when processing
1591              the right operand the items currently in the array are
1592              invisible.  The original bottom was saved at ADDTAGS_UNION and
1593              will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1594           while (*regset >= 0)
1595             regset++;
1596           break;
1597
1598         case ADDTAGS_AFTER_UNION_RIGHT:
1599           {
1600             int added_tags, tag_left, tag_right;
1601             tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1602             tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1603             node = tre_stack_pop_voidptr(stack);
1604             added_tags = tre_stack_pop_int(stack);
1605             if (first_pass)
1606               {
1607                 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1608                   + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1609                   + ((node->num_submatches > 0) ? 2 : 0);
1610               }
1611             regset = tre_stack_pop_voidptr(stack);
1612             tag_left = tre_stack_pop_int(stack);
1613             tag_right = tre_stack_pop_int(stack);
1614
1615             /* Add tags after both children, the left child gets a smaller
1616                tag than the right child.  This guarantees that we prefer
1617                the left child over the right child. */
1618             /* XXX - This is not always necessary (if the children have
1619                tags which must be seen for every match of that child). */
1620             /* XXX - Check if this is the only place where tre_add_tag_right
1621                is used.  If so, use tre_add_tag_left (putting the tag before
1622                the child as opposed after the child) and throw away
1623                tre_add_tag_right. */
1624             if (node->num_submatches > 0)
1625               {
1626                 if (!first_pass)
1627                   {
1628                     status = tre_add_tag_right(mem, left, tag_left);
1629                     tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1630                     if (status == REG_OK)
1631                       status = tre_add_tag_right(mem, right, tag_right);
1632                     tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1633                   }
1634                 num_tags += 2;
1635               }
1636             direction = TRE_TAG_MAXIMIZE;
1637             break;
1638           }
1639
1640         default:
1641           assert(0);
1642           break;
1643
1644         } /* end switch(symbol) */
1645     } /* end while(tre_stack_num_objects(stack) > bottom) */
1646
1647   if (!first_pass)
1648     tre_purge_regset(regset, tnfa, tag);
1649
1650   if (!first_pass && minimal_tag >= 0)
1651     {
1652       int i;
1653       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1654       tnfa->minimal_tags[i] = tag;
1655       tnfa->minimal_tags[i + 1] = minimal_tag;
1656       tnfa->minimal_tags[i + 2] = -1;
1657       minimal_tag = -1;
1658       num_minimals++;
1659     }
1660
1661   assert(tree->num_tags == num_tags);
1662   tnfa->end_tag = num_tags;
1663   tnfa->num_tags = num_tags;
1664   tnfa->num_minimals = num_minimals;
1665   xfree(orig_regset);
1666   xfree(parents);
1667   xfree(saved_states);
1668   return status;
1669 }
1670
1671
1672
1673 /*
1674   AST to TNFA compilation routines.
1675 */
1676
1677 typedef enum {
1678   COPY_RECURSE,
1679   COPY_SET_RESULT_PTR
1680 } tre_copyast_symbol_t;
1681
1682 /* Flags for tre_copy_ast(). */
1683 #define COPY_REMOVE_TAGS         1
1684 #define COPY_MAXIMIZE_FIRST_TAG  2
1685
1686 static reg_errcode_t
1687 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1688              int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1689              tre_ast_node_t **copy, int *max_pos)
1690 {
1691   reg_errcode_t status = REG_OK;
1692   int bottom = tre_stack_num_objects(stack);
1693   int num_copied = 0;
1694   int first_tag = 1;
1695   tre_ast_node_t **result = copy;
1696   tre_copyast_symbol_t symbol;
1697
1698   STACK_PUSH(stack, voidptr, ast);
1699   STACK_PUSH(stack, int, COPY_RECURSE);
1700
1701   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1702     {
1703       tre_ast_node_t *node;
1704       if (status != REG_OK)
1705         break;
1706
1707       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1708       switch (symbol)
1709         {
1710         case COPY_SET_RESULT_PTR:
1711           result = tre_stack_pop_voidptr(stack);
1712           break;
1713         case COPY_RECURSE:
1714           node = tre_stack_pop_voidptr(stack);
1715           switch (node->type)
1716             {
1717             case LITERAL:
1718               {
1719                 tre_literal_t *lit = node->obj;
1720                 int pos = lit->position;
1721                 int min = lit->code_min;
1722                 int max = lit->code_max;
1723                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1724                   {
1725                     /* XXX - e.g. [ab] has only one position but two
1726                        nodes, so we are creating holes in the state space
1727                        here.  Not fatal, just wastes memory. */
1728                     pos += *pos_add;
1729                     num_copied++;
1730                   }
1731                 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1732                   {
1733                     /* Change this tag to empty. */
1734                     min = EMPTY;
1735                     max = pos = -1;
1736                   }
1737                 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1738                          && first_tag)
1739                   {
1740                     /* Maximize the first tag. */
1741                     tag_directions[max] = TRE_TAG_MAXIMIZE;
1742                     first_tag = 0;
1743                   }
1744                 *result = tre_ast_new_literal(mem, min, max, pos);
1745                 if (*result == NULL)
1746                   status = REG_ESPACE;
1747                 else {
1748                   tre_literal_t *p = (*result)->obj;
1749                   p->class = lit->class;
1750                   p->neg_classes = lit->neg_classes;
1751                 }
1752
1753                 if (pos > *max_pos)
1754                   *max_pos = pos;
1755                 break;
1756               }
1757             case UNION:
1758               {
1759                 tre_union_t *uni = node->obj;
1760                 tre_union_t *tmp;
1761                 *result = tre_ast_new_union(mem, uni->left, uni->right);
1762                 if (*result == NULL)
1763                   {
1764                     status = REG_ESPACE;
1765                     break;
1766                   }
1767                 tmp = (*result)->obj;
1768                 result = &tmp->left;
1769                 STACK_PUSHX(stack, voidptr, uni->right);
1770                 STACK_PUSHX(stack, int, COPY_RECURSE);
1771                 STACK_PUSHX(stack, voidptr, &tmp->right);
1772                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1773                 STACK_PUSHX(stack, voidptr, uni->left);
1774                 STACK_PUSHX(stack, int, COPY_RECURSE);
1775                 break;
1776               }
1777             case CATENATION:
1778               {
1779                 tre_catenation_t *cat = node->obj;
1780                 tre_catenation_t *tmp;
1781                 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
1782                 if (*result == NULL)
1783                   {
1784                     status = REG_ESPACE;
1785                     break;
1786                   }
1787                 tmp = (*result)->obj;
1788                 tmp->left = NULL;
1789                 tmp->right = NULL;
1790                 result = &tmp->left;
1791
1792                 STACK_PUSHX(stack, voidptr, cat->right);
1793                 STACK_PUSHX(stack, int, COPY_RECURSE);
1794                 STACK_PUSHX(stack, voidptr, &tmp->right);
1795                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1796                 STACK_PUSHX(stack, voidptr, cat->left);
1797                 STACK_PUSHX(stack, int, COPY_RECURSE);
1798                 break;
1799               }
1800             case ITERATION:
1801               {
1802                 tre_iteration_t *iter = node->obj;
1803                 STACK_PUSHX(stack, voidptr, iter->arg);
1804                 STACK_PUSHX(stack, int, COPY_RECURSE);
1805                 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
1806                                            iter->max, iter->minimal);
1807                 if (*result == NULL)
1808                   {
1809                     status = REG_ESPACE;
1810                     break;
1811                   }
1812                 iter = (*result)->obj;
1813                 result = &iter->arg;
1814                 break;
1815               }
1816             default:
1817               assert(0);
1818               break;
1819             }
1820           break;
1821         }
1822     }
1823   *pos_add += num_copied;
1824   return status;
1825 }
1826
1827 typedef enum {
1828   EXPAND_RECURSE,
1829   EXPAND_AFTER_ITER
1830 } tre_expand_ast_symbol_t;
1831
1832 /* Expands each iteration node that has a finite nonzero minimum or maximum
1833    iteration count to a catenated sequence of copies of the node. */
1834 static reg_errcode_t
1835 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1836                int *position, tre_tag_direction_t *tag_directions)
1837 {
1838   reg_errcode_t status = REG_OK;
1839   int bottom = tre_stack_num_objects(stack);
1840   int pos_add = 0;
1841   int pos_add_total = 0;
1842   int max_pos = 0;
1843   int iter_depth = 0;
1844
1845   STACK_PUSHR(stack, voidptr, ast);
1846   STACK_PUSHR(stack, int, EXPAND_RECURSE);
1847   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1848     {
1849       tre_ast_node_t *node;
1850       tre_expand_ast_symbol_t symbol;
1851
1852       if (status != REG_OK)
1853         break;
1854
1855       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1856       node = tre_stack_pop_voidptr(stack);
1857       switch (symbol)
1858         {
1859         case EXPAND_RECURSE:
1860           switch (node->type)
1861             {
1862             case LITERAL:
1863               {
1864                 tre_literal_t *lit= node->obj;
1865                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1866                   {
1867                     lit->position += pos_add;
1868                     if (lit->position > max_pos)
1869                       max_pos = lit->position;
1870                   }
1871                 break;
1872               }
1873             case UNION:
1874               {
1875                 tre_union_t *uni = node->obj;
1876                 STACK_PUSHX(stack, voidptr, uni->right);
1877                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1878                 STACK_PUSHX(stack, voidptr, uni->left);
1879                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1880                 break;
1881               }
1882             case CATENATION:
1883               {
1884                 tre_catenation_t *cat = node->obj;
1885                 STACK_PUSHX(stack, voidptr, cat->right);
1886                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1887                 STACK_PUSHX(stack, voidptr, cat->left);
1888                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1889                 break;
1890               }
1891             case ITERATION:
1892               {
1893                 tre_iteration_t *iter = node->obj;
1894                 STACK_PUSHX(stack, int, pos_add);
1895                 STACK_PUSHX(stack, voidptr, node);
1896                 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1897                 STACK_PUSHX(stack, voidptr, iter->arg);
1898                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1899                 /* If we are going to expand this node at EXPAND_AFTER_ITER
1900                    then don't increase the `pos' fields of the nodes now, it
1901                    will get done when expanding. */
1902                 if (iter->min > 1 || iter->max > 1)
1903                   pos_add = 0;
1904                 iter_depth++;
1905                 break;
1906               }
1907             default:
1908               assert(0);
1909               break;
1910             }
1911           break;
1912         case EXPAND_AFTER_ITER:
1913           {
1914             tre_iteration_t *iter = node->obj;
1915             int pos_add_last;
1916             pos_add = tre_stack_pop_int(stack);
1917             pos_add_last = pos_add;
1918             if (iter->min > 1 || iter->max > 1)
1919               {
1920                 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1921                 int j;
1922                 int pos_add_save = pos_add;
1923
1924                 /* Create a catenated sequence of copies of the node. */
1925                 for (j = 0; j < iter->min; j++)
1926                   {
1927                     tre_ast_node_t *copy;
1928                     /* Remove tags from all but the last copy. */
1929                     int flags = ((j + 1 < iter->min)
1930                                  ? COPY_REMOVE_TAGS
1931                                  : COPY_MAXIMIZE_FIRST_TAG);
1932                     pos_add_save = pos_add;
1933                     status = tre_copy_ast(mem, stack, iter->arg, flags,
1934                                           &pos_add, tag_directions, &copy,
1935                                           &max_pos);
1936                     if (status != REG_OK)
1937                       return status;
1938                     if (seq1 != NULL)
1939                       seq1 = tre_ast_new_catenation(mem, seq1, copy);
1940                     else
1941                       seq1 = copy;
1942                     if (seq1 == NULL)
1943                       return REG_ESPACE;
1944                   }
1945
1946                 if (iter->max == -1)
1947                   {
1948                     /* No upper limit. */
1949                     pos_add_save = pos_add;
1950                     status = tre_copy_ast(mem, stack, iter->arg, 0,
1951                                           &pos_add, NULL, &seq2, &max_pos);
1952                     if (status != REG_OK)
1953                       return status;
1954                     seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1955                     if (seq2 == NULL)
1956                       return REG_ESPACE;
1957                   }
1958                 else
1959                   {
1960                     for (j = iter->min; j < iter->max; j++)
1961                       {
1962                         tre_ast_node_t *tmp, *copy;
1963                         pos_add_save = pos_add;
1964                         status = tre_copy_ast(mem, stack, iter->arg, 0,
1965                                               &pos_add, NULL, &copy, &max_pos);
1966                         if (status != REG_OK)
1967                           return status;
1968                         if (seq2 != NULL)
1969                           seq2 = tre_ast_new_catenation(mem, copy, seq2);
1970                         else
1971                           seq2 = copy;
1972                         if (seq2 == NULL)
1973                           return REG_ESPACE;
1974                         tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1975                         if (tmp == NULL)
1976                           return REG_ESPACE;
1977                         seq2 = tre_ast_new_union(mem, tmp, seq2);
1978                         if (seq2 == NULL)
1979                           return REG_ESPACE;
1980                       }
1981                   }
1982
1983                 pos_add = pos_add_save;
1984                 if (seq1 == NULL)
1985                   seq1 = seq2;
1986                 else if (seq2 != NULL)
1987                   seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1988                 if (seq1 == NULL)
1989                   return REG_ESPACE;
1990                 node->obj = seq1->obj;
1991                 node->type = seq1->type;
1992               }
1993
1994             iter_depth--;
1995             pos_add_total += pos_add - pos_add_last;
1996             if (iter_depth == 0)
1997               pos_add = pos_add_total;
1998
1999             break;
2000           }
2001         default:
2002           assert(0);
2003           break;
2004         }
2005     }
2006
2007   *position += pos_add_total;
2008
2009   /* `max_pos' should never be larger than `*position' if the above
2010      code works, but just an extra safeguard let's make sure
2011      `*position' is set large enough so enough memory will be
2012      allocated for the transition table. */
2013   if (max_pos > *position)
2014     *position = max_pos;
2015
2016   return status;
2017 }
2018
2019 static tre_pos_and_tags_t *
2020 tre_set_empty(tre_mem_t mem)
2021 {
2022   tre_pos_and_tags_t *new_set;
2023
2024   new_set = tre_mem_calloc(mem, sizeof(*new_set));
2025   if (new_set == NULL)
2026     return NULL;
2027
2028   new_set[0].position = -1;
2029   new_set[0].code_min = -1;
2030   new_set[0].code_max = -1;
2031
2032   return new_set;
2033 }
2034
2035 static tre_pos_and_tags_t *
2036 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2037             tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2038 {
2039   tre_pos_and_tags_t *new_set;
2040
2041   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2042   if (new_set == NULL)
2043     return NULL;
2044
2045   new_set[0].position = position;
2046   new_set[0].code_min = code_min;
2047   new_set[0].code_max = code_max;
2048   new_set[0].class = class;
2049   new_set[0].neg_classes = neg_classes;
2050   new_set[0].backref = backref;
2051   new_set[1].position = -1;
2052   new_set[1].code_min = -1;
2053   new_set[1].code_max = -1;
2054
2055   return new_set;
2056 }
2057
2058 static tre_pos_and_tags_t *
2059 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2060               int *tags, int assertions)
2061 {
2062   int s1, s2, i, j;
2063   tre_pos_and_tags_t *new_set;
2064   int *new_tags;
2065   int num_tags;
2066
2067   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2068   for (s1 = 0; set1[s1].position >= 0; s1++);
2069   for (s2 = 0; set2[s2].position >= 0; s2++);
2070   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2071   if (!new_set )
2072     return NULL;
2073
2074   for (s1 = 0; set1[s1].position >= 0; s1++)
2075     {
2076       new_set[s1].position = set1[s1].position;
2077       new_set[s1].code_min = set1[s1].code_min;
2078       new_set[s1].code_max = set1[s1].code_max;
2079       new_set[s1].assertions = set1[s1].assertions | assertions;
2080       new_set[s1].class = set1[s1].class;
2081       new_set[s1].neg_classes = set1[s1].neg_classes;
2082       new_set[s1].backref = set1[s1].backref;
2083       if (set1[s1].tags == NULL && tags == NULL)
2084         new_set[s1].tags = NULL;
2085       else
2086         {
2087           for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2088           new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2089                                          * (i + num_tags + 1)));
2090           if (new_tags == NULL)
2091             return NULL;
2092           for (j = 0; j < i; j++)
2093             new_tags[j] = set1[s1].tags[j];
2094           for (i = 0; i < num_tags; i++)
2095             new_tags[j + i] = tags[i];
2096           new_tags[j + i] = -1;
2097           new_set[s1].tags = new_tags;
2098         }
2099     }
2100
2101   for (s2 = 0; set2[s2].position >= 0; s2++)
2102     {
2103       new_set[s1 + s2].position = set2[s2].position;
2104       new_set[s1 + s2].code_min = set2[s2].code_min;
2105       new_set[s1 + s2].code_max = set2[s2].code_max;
2106       /* XXX - why not | assertions here as well? */
2107       new_set[s1 + s2].assertions = set2[s2].assertions;
2108       new_set[s1 + s2].class = set2[s2].class;
2109       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2110       new_set[s1 + s2].backref = set2[s2].backref;
2111       if (set2[s2].tags == NULL)
2112         new_set[s1 + s2].tags = NULL;
2113       else
2114         {
2115           for (i = 0; set2[s2].tags[i] >= 0; i++);
2116           new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2117           if (new_tags == NULL)
2118             return NULL;
2119           for (j = 0; j < i; j++)
2120             new_tags[j] = set2[s2].tags[j];
2121           new_tags[j] = -1;
2122           new_set[s1 + s2].tags = new_tags;
2123         }
2124     }
2125   new_set[s1 + s2].position = -1;
2126   return new_set;
2127 }
2128
2129 /* Finds the empty path through `node' which is the one that should be
2130    taken according to POSIX.2 rules, and adds the tags on that path to
2131    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2132    set to the number of tags seen on the path. */
2133 static reg_errcode_t
2134 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2135                 int *assertions, int *num_tags_seen)
2136 {
2137   tre_literal_t *lit;
2138   tre_union_t *uni;
2139   tre_catenation_t *cat;
2140   tre_iteration_t *iter;
2141   int i;
2142   int bottom = tre_stack_num_objects(stack);
2143   reg_errcode_t status = REG_OK;
2144   if (num_tags_seen)
2145     *num_tags_seen = 0;
2146
2147   status = tre_stack_push_voidptr(stack, node);
2148
2149   /* Walk through the tree recursively. */
2150   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2151     {
2152       node = tre_stack_pop_voidptr(stack);
2153
2154       switch (node->type)
2155         {
2156         case LITERAL:
2157           lit = (tre_literal_t *)node->obj;
2158           switch (lit->code_min)
2159             {
2160             case TAG:
2161               if (lit->code_max >= 0)
2162                 {
2163                   if (tags != NULL)
2164                     {
2165                       /* Add the tag to `tags'. */
2166                       for (i = 0; tags[i] >= 0; i++)
2167                         if (tags[i] == lit->code_max)
2168                           break;
2169                       if (tags[i] < 0)
2170                         {
2171                           tags[i] = lit->code_max;
2172                           tags[i + 1] = -1;
2173                         }
2174                     }
2175                   if (num_tags_seen)
2176                     (*num_tags_seen)++;
2177                 }
2178               break;
2179             case ASSERTION:
2180               assert(lit->code_max >= 1
2181                      || lit->code_max <= ASSERT_LAST);
2182               if (assertions != NULL)
2183                 *assertions |= lit->code_max;
2184               break;
2185             case EMPTY:
2186               break;
2187             default:
2188               assert(0);
2189               break;
2190             }
2191           break;
2192
2193         case UNION:
2194           /* Subexpressions starting earlier take priority over ones
2195              starting later, so we prefer the left subexpression over the
2196              right subexpression. */
2197           uni = (tre_union_t *)node->obj;
2198           if (uni->left->nullable)
2199             STACK_PUSHX(stack, voidptr, uni->left)
2200           else if (uni->right->nullable)
2201             STACK_PUSHX(stack, voidptr, uni->right)
2202           else
2203             assert(0);
2204           break;
2205
2206         case CATENATION:
2207           /* The path must go through both children. */
2208           cat = (tre_catenation_t *)node->obj;
2209           assert(cat->left->nullable);
2210           assert(cat->right->nullable);
2211           STACK_PUSHX(stack, voidptr, cat->left);
2212           STACK_PUSHX(stack, voidptr, cat->right);
2213           break;
2214
2215         case ITERATION:
2216           /* A match with an empty string is preferred over no match at
2217              all, so we go through the argument if possible. */
2218           iter = (tre_iteration_t *)node->obj;
2219           if (iter->arg->nullable)
2220             STACK_PUSHX(stack, voidptr, iter->arg);
2221           break;
2222
2223         default:
2224           assert(0);
2225           break;
2226         }
2227     }
2228
2229   return status;
2230 }
2231
2232
2233 typedef enum {
2234   NFL_RECURSE,
2235   NFL_POST_UNION,
2236   NFL_POST_CATENATION,
2237   NFL_POST_ITERATION
2238 } tre_nfl_stack_symbol_t;
2239
2240
2241 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2242    the nodes of the AST `tree'. */
2243 static reg_errcode_t
2244 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2245 {
2246   int bottom = tre_stack_num_objects(stack);
2247
2248   STACK_PUSHR(stack, voidptr, tree);
2249   STACK_PUSHR(stack, int, NFL_RECURSE);
2250
2251   while (tre_stack_num_objects(stack) > bottom)
2252     {
2253       tre_nfl_stack_symbol_t symbol;
2254       tre_ast_node_t *node;
2255
2256       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2257       node = tre_stack_pop_voidptr(stack);
2258       switch (symbol)
2259         {
2260         case NFL_RECURSE:
2261           switch (node->type)
2262             {
2263             case LITERAL:
2264               {
2265                 tre_literal_t *lit = (tre_literal_t *)node->obj;
2266                 if (IS_BACKREF(lit))
2267                   {
2268                     /* Back references: nullable = false, firstpos = {i},
2269                        lastpos = {i}. */
2270                     node->nullable = 0;
2271                     node->firstpos = tre_set_one(mem, lit->position, 0,
2272                                              TRE_CHAR_MAX, 0, NULL, -1);
2273                     if (!node->firstpos)
2274                       return REG_ESPACE;
2275                     node->lastpos = tre_set_one(mem, lit->position, 0,
2276                                                 TRE_CHAR_MAX, 0, NULL,
2277                                                 (int)lit->code_max);
2278                     if (!node->lastpos)
2279                       return REG_ESPACE;
2280                   }
2281                 else if (lit->code_min < 0)
2282                   {
2283                     /* Tags, empty strings, params, and zero width assertions:
2284                        nullable = true, firstpos = {}, and lastpos = {}. */
2285                     node->nullable = 1;
2286                     node->firstpos = tre_set_empty(mem);
2287                     if (!node->firstpos)
2288                       return REG_ESPACE;
2289                     node->lastpos = tre_set_empty(mem);
2290                     if (!node->lastpos)
2291                       return REG_ESPACE;
2292                   }
2293                 else
2294                   {
2295                     /* Literal at position i: nullable = false, firstpos = {i},
2296                        lastpos = {i}. */
2297                     node->nullable = 0;
2298                     node->firstpos =
2299                       tre_set_one(mem, lit->position, (int)lit->code_min,
2300                                   (int)lit->code_max, 0, NULL, -1);
2301                     if (!node->firstpos)
2302                       return REG_ESPACE;
2303                     node->lastpos = tre_set_one(mem, lit->position,
2304                                                 (int)lit->code_min,
2305                                                 (int)lit->code_max,
2306                                                 lit->class, lit->neg_classes,
2307                                                 -1);
2308                     if (!node->lastpos)
2309                       return REG_ESPACE;
2310                   }
2311                 break;
2312               }
2313
2314             case UNION:
2315               /* Compute the attributes for the two subtrees, and after that
2316                  for this node. */
2317               STACK_PUSHR(stack, voidptr, node);
2318               STACK_PUSHR(stack, int, NFL_POST_UNION);
2319               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2320               STACK_PUSHR(stack, int, NFL_RECURSE);
2321               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2322               STACK_PUSHR(stack, int, NFL_RECURSE);
2323               break;
2324
2325             case CATENATION:
2326               /* Compute the attributes for the two subtrees, and after that
2327                  for this node. */
2328               STACK_PUSHR(stack, voidptr, node);
2329               STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2330               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2331               STACK_PUSHR(stack, int, NFL_RECURSE);
2332               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2333               STACK_PUSHR(stack, int, NFL_RECURSE);
2334               break;
2335
2336             case ITERATION:
2337               /* Compute the attributes for the subtree, and after that for
2338                  this node. */
2339               STACK_PUSHR(stack, voidptr, node);
2340               STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2341               STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2342               STACK_PUSHR(stack, int, NFL_RECURSE);
2343               break;
2344             }
2345           break; /* end case: NFL_RECURSE */
2346
2347         case NFL_POST_UNION:
2348           {
2349             tre_union_t *uni = (tre_union_t *)node->obj;
2350             node->nullable = uni->left->nullable || uni->right->nullable;
2351             node->firstpos = tre_set_union(mem, uni->left->firstpos,
2352                                            uni->right->firstpos, NULL, 0);
2353             if (!node->firstpos)
2354               return REG_ESPACE;
2355             node->lastpos = tre_set_union(mem, uni->left->lastpos,
2356                                           uni->right->lastpos, NULL, 0);
2357             if (!node->lastpos)
2358               return REG_ESPACE;
2359             break;
2360           }
2361
2362         case NFL_POST_ITERATION:
2363           {
2364             tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2365
2366             if (iter->min == 0 || iter->arg->nullable)
2367               node->nullable = 1;
2368             else
2369               node->nullable = 0;
2370             node->firstpos = iter->arg->firstpos;
2371             node->lastpos = iter->arg->lastpos;
2372             break;
2373           }
2374
2375         case NFL_POST_CATENATION:
2376           {
2377             int num_tags, *tags, assertions;
2378             reg_errcode_t status;
2379             tre_catenation_t *cat = node->obj;
2380             node->nullable = cat->left->nullable && cat->right->nullable;
2381
2382             /* Compute firstpos. */
2383             if (cat->left->nullable)
2384               {
2385                 /* The left side matches the empty string.  Make a first pass
2386                    with tre_match_empty() to get the number of tags and
2387                    parameters. */
2388                 status = tre_match_empty(stack, cat->left,
2389                                          NULL, NULL, &num_tags);
2390                 if (status != REG_OK)
2391                   return status;
2392                 /* Allocate arrays for the tags and parameters. */
2393                 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2394                 if (!tags)
2395                   return REG_ESPACE;
2396                 tags[0] = -1;
2397                 assertions = 0;
2398                 /* Second pass with tre_mach_empty() to get the list of
2399                    tags and parameters. */
2400                 status = tre_match_empty(stack, cat->left, tags,
2401                                          &assertions, NULL);
2402                 if (status != REG_OK)
2403                   {
2404                     xfree(tags);
2405                     return status;
2406                   }
2407                 node->firstpos =
2408                   tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2409                                 tags, assertions);
2410                 xfree(tags);
2411                 if (!node->firstpos)
2412                   return REG_ESPACE;
2413               }
2414             else
2415               {
2416                 node->firstpos = cat->left->firstpos;
2417               }
2418
2419             /* Compute lastpos. */
2420             if (cat->right->nullable)
2421               {
2422                 /* The right side matches the empty string.  Make a first pass
2423                    with tre_match_empty() to get the number of tags and
2424                    parameters. */
2425                 status = tre_match_empty(stack, cat->right,
2426                                          NULL, NULL, &num_tags);
2427                 if (status != REG_OK)
2428                   return status;
2429                 /* Allocate arrays for the tags and parameters. */
2430                 tags = xmalloc(sizeof(int) * (num_tags + 1));
2431                 if (!tags)
2432                   return REG_ESPACE;
2433                 tags[0] = -1;
2434                 assertions = 0;
2435                 /* Second pass with tre_mach_empty() to get the list of
2436                    tags and parameters. */
2437                 status = tre_match_empty(stack, cat->right, tags,
2438                                          &assertions, NULL);
2439                 if (status != REG_OK)
2440                   {
2441                     xfree(tags);
2442                     return status;
2443                   }
2444                 node->lastpos =
2445                   tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2446                                 tags, assertions);
2447                 xfree(tags);
2448                 if (!node->lastpos)
2449                   return REG_ESPACE;
2450               }
2451             else
2452               {
2453                 node->lastpos = cat->right->lastpos;
2454               }
2455             break;
2456           }
2457
2458         default:
2459           assert(0);
2460           break;
2461         }
2462     }
2463
2464   return REG_OK;
2465 }
2466
2467
2468 /* Adds a transition from each position in `p1' to each position in `p2'. */
2469 static reg_errcode_t
2470 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2471                tre_tnfa_transition_t *transitions,
2472                int *counts, int *offs)
2473 {
2474   tre_pos_and_tags_t *orig_p2 = p2;
2475   tre_tnfa_transition_t *trans;
2476   int i, j, k, l, dup, prev_p2_pos;
2477
2478   if (transitions != NULL)
2479     while (p1->position >= 0)
2480       {
2481         p2 = orig_p2;
2482         prev_p2_pos = -1;
2483         while (p2->position >= 0)
2484           {
2485             /* Optimization: if this position was already handled, skip it. */
2486             if (p2->position == prev_p2_pos)
2487               {
2488                 p2++;
2489                 continue;
2490               }
2491             prev_p2_pos = p2->position;
2492             /* Set `trans' to point to the next unused transition from
2493                position `p1->position'. */
2494             trans = transitions + offs[p1->position];
2495             while (trans->state != NULL)
2496               {
2497 #if 0
2498                 /* If we find a previous transition from `p1->position' to
2499                    `p2->position', it is overwritten.  This can happen only
2500                    if there are nested loops in the regexp, like in "((a)*)*".
2501                    In POSIX.2 repetition using the outer loop is always
2502                    preferred over using the inner loop.  Therefore the
2503                    transition for the inner loop is useless and can be thrown
2504                    away. */
2505                 /* XXX - The same position is used for all nodes in a bracket
2506                    expression, so this optimization cannot be used (it will
2507                    break bracket expressions) unless I figure out a way to
2508                    detect it here. */
2509                 if (trans->state_id == p2->position)
2510                   {
2511                     break;
2512                   }
2513 #endif
2514                 trans++;
2515               }
2516
2517             if (trans->state == NULL)
2518               (trans + 1)->state = NULL;
2519             /* Use the character ranges, assertions, etc. from `p1' for
2520                the transition from `p1' to `p2'. */
2521             trans->code_min = p1->code_min;
2522             trans->code_max = p1->code_max;
2523             trans->state = transitions + offs[p2->position];
2524             trans->state_id = p2->position;
2525             trans->assertions = p1->assertions | p2->assertions
2526               | (p1->class ? ASSERT_CHAR_CLASS : 0)
2527               | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2528             if (p1->backref >= 0)
2529               {
2530                 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2531                 assert(p2->backref < 0);
2532                 trans->u.backref = p1->backref;
2533                 trans->assertions |= ASSERT_BACKREF;
2534               }
2535             else
2536               trans->u.class = p1->class;
2537             if (p1->neg_classes != NULL)
2538               {
2539                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2540                 trans->neg_classes =
2541                   xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2542                 if (trans->neg_classes == NULL)
2543                   return REG_ESPACE;
2544                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2545                   trans->neg_classes[i] = p1->neg_classes[i];
2546                 trans->neg_classes[i] = (tre_ctype_t)0;
2547               }
2548             else
2549               trans->neg_classes = NULL;
2550
2551             /* Find out how many tags this transition has. */
2552             i = 0;
2553             if (p1->tags != NULL)
2554               while(p1->tags[i] >= 0)
2555                 i++;
2556             j = 0;
2557             if (p2->tags != NULL)
2558               while(p2->tags[j] >= 0)
2559                 j++;
2560
2561             /* If we are overwriting a transition, free the old tag array. */
2562             if (trans->tags != NULL)
2563               xfree(trans->tags);
2564             trans->tags = NULL;
2565
2566             /* If there were any tags, allocate an array and fill it. */
2567             if (i + j > 0)
2568               {
2569                 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2570                 if (!trans->tags)
2571                   return REG_ESPACE;
2572                 i = 0;
2573                 if (p1->tags != NULL)
2574                   while(p1->tags[i] >= 0)
2575                     {
2576                       trans->tags[i] = p1->tags[i];
2577                       i++;
2578                     }
2579                 l = i;
2580                 j = 0;
2581                 if (p2->tags != NULL)
2582                   while (p2->tags[j] >= 0)
2583                     {
2584                       /* Don't add duplicates. */
2585                       dup = 0;
2586                       for (k = 0; k < i; k++)
2587                         if (trans->tags[k] == p2->tags[j])
2588                           {
2589                             dup = 1;
2590                             break;
2591                           }
2592                       if (!dup)
2593                         trans->tags[l++] = p2->tags[j];
2594                       j++;
2595                     }
2596                 trans->tags[l] = -1;
2597               }
2598
2599             p2++;
2600           }
2601         p1++;
2602       }
2603   else
2604     /* Compute a maximum limit for the number of transitions leaving
2605        from each state. */
2606     while (p1->position >= 0)
2607       {
2608         p2 = orig_p2;
2609         while (p2->position >= 0)
2610           {
2611             counts[p1->position]++;
2612             p2++;
2613           }
2614         p1++;
2615       }
2616   return REG_OK;
2617 }
2618
2619 /* Converts the syntax tree to a TNFA.  All the transitions in the TNFA are
2620    labelled with one character range (there are no transitions on empty
2621    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
2622    the regexp. */
2623 static reg_errcode_t
2624 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2625                 int *counts, int *offs)
2626 {
2627   tre_union_t *uni;
2628   tre_catenation_t *cat;
2629   tre_iteration_t *iter;
2630   reg_errcode_t errcode = REG_OK;
2631
2632   /* XXX - recurse using a stack!. */
2633   switch (node->type)
2634     {
2635     case LITERAL:
2636       break;
2637     case UNION:
2638       uni = (tre_union_t *)node->obj;
2639       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2640       if (errcode != REG_OK)
2641         return errcode;
2642       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2643       break;
2644
2645     case CATENATION:
2646       cat = (tre_catenation_t *)node->obj;
2647       /* Add a transition from each position in cat->left->lastpos
2648          to each position in cat->right->firstpos. */
2649       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2650                                transitions, counts, offs);
2651       if (errcode != REG_OK)
2652         return errcode;
2653       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2654       if (errcode != REG_OK)
2655         return errcode;
2656       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2657       break;
2658
2659     case ITERATION:
2660       iter = (tre_iteration_t *)node->obj;
2661       assert(iter->max == -1 || iter->max == 1);
2662
2663       if (iter->max == -1)
2664         {
2665           assert(iter->min == 0 || iter->min == 1);
2666           /* Add a transition from each last position in the iterated
2667              expression to each first position. */
2668           errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2669                                    transitions, counts, offs);
2670           if (errcode != REG_OK)
2671             return errcode;
2672         }
2673       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2674       break;
2675     }
2676   return errcode;
2677 }
2678
2679
2680 #define ERROR_EXIT(err)           \
2681   do                              \
2682     {                             \
2683       errcode = err;              \
2684       if (/*CONSTCOND*/1)         \
2685         goto error_exit;          \
2686     }                             \
2687  while (/*CONSTCOND*/0)
2688
2689
2690 int
2691 regcomp(regex_t *__restrict preg, const char *__restrict regex, int cflags)
2692 {
2693   tre_stack_t *stack;
2694   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2695   tre_pos_and_tags_t *p;
2696   int *counts = NULL, *offs = NULL;
2697   int i, add = 0;
2698   tre_tnfa_transition_t *transitions, *initial;
2699   tre_tnfa_t *tnfa = NULL;
2700   tre_submatch_data_t *submatch_data;
2701   tre_tag_direction_t *tag_directions = NULL;
2702   reg_errcode_t errcode;
2703   tre_mem_t mem;
2704
2705   /* Parse context. */
2706   tre_parse_ctx_t parse_ctx;
2707
2708   /* Allocate a stack used throughout the compilation process for various
2709      purposes. */
2710   stack = tre_stack_new(512, 1024000, 128);
2711   if (!stack)
2712     return REG_ESPACE;
2713   /* Allocate a fast memory allocator. */
2714   mem = tre_mem_new();
2715   if (!mem)
2716     {
2717       tre_stack_destroy(stack);
2718       return REG_ESPACE;
2719     }
2720
2721   /* Parse the regexp. */
2722   memset(&parse_ctx, 0, sizeof(parse_ctx));
2723   parse_ctx.mem = mem;
2724   parse_ctx.stack = stack;
2725   parse_ctx.start = regex;
2726   parse_ctx.cflags = cflags;
2727   parse_ctx.max_backref = -1;
2728   errcode = tre_parse(&parse_ctx);
2729   if (errcode != REG_OK)
2730     ERROR_EXIT(errcode);
2731   preg->re_nsub = parse_ctx.submatch_id - 1;
2732   tree = parse_ctx.n;
2733
2734 #ifdef TRE_DEBUG
2735   tre_ast_print(tree);
2736 #endif /* TRE_DEBUG */
2737
2738   /* Referring to nonexistent subexpressions is illegal. */
2739   if (parse_ctx.max_backref > (int)preg->re_nsub)
2740     ERROR_EXIT(REG_ESUBREG);
2741
2742   /* Allocate the TNFA struct. */
2743   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2744   if (tnfa == NULL)
2745     ERROR_EXIT(REG_ESPACE);
2746   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2747   tnfa->have_approx = 0;
2748   tnfa->num_submatches = parse_ctx.submatch_id;
2749
2750   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
2751      regexp does not have back references, this can be skipped. */
2752   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2753     {
2754
2755       /* Figure out how many tags we will need. */
2756       errcode = tre_add_tags(NULL, stack, tree, tnfa);
2757       if (errcode != REG_OK)
2758         ERROR_EXIT(errcode);
2759
2760       if (tnfa->num_tags > 0)
2761         {
2762           tag_directions = xmalloc(sizeof(*tag_directions)
2763                                    * (tnfa->num_tags + 1));
2764           if (tag_directions == NULL)
2765             ERROR_EXIT(REG_ESPACE);
2766           tnfa->tag_directions = tag_directions;
2767           memset(tag_directions, -1,
2768                  sizeof(*tag_directions) * (tnfa->num_tags + 1));
2769         }
2770       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2771                                    sizeof(*tnfa->minimal_tags));
2772       if (tnfa->minimal_tags == NULL)
2773         ERROR_EXIT(REG_ESPACE);
2774
2775       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2776                               sizeof(*submatch_data));
2777       if (submatch_data == NULL)
2778         ERROR_EXIT(REG_ESPACE);
2779       tnfa->submatch_data = submatch_data;
2780
2781       errcode = tre_add_tags(mem, stack, tree, tnfa);
2782       if (errcode != REG_OK)
2783         ERROR_EXIT(errcode);
2784
2785     }
2786
2787   /* Expand iteration nodes. */
2788   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2789                            tag_directions);
2790   if (errcode != REG_OK)
2791     ERROR_EXIT(errcode);
2792
2793   /* Add a dummy node for the final state.
2794      XXX - For certain patterns this dummy node can be optimized away,
2795            for example "a*" or "ab*".   Figure out a simple way to detect
2796            this possibility. */
2797   tmp_ast_l = tree;
2798   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2799   if (tmp_ast_r == NULL)
2800     ERROR_EXIT(REG_ESPACE);
2801
2802   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2803   if (tree == NULL)
2804     ERROR_EXIT(REG_ESPACE);
2805
2806   errcode = tre_compute_nfl(mem, stack, tree);
2807   if (errcode != REG_OK)
2808     ERROR_EXIT(errcode);
2809
2810   counts = xmalloc(sizeof(int) * parse_ctx.position);
2811   if (counts == NULL)
2812     ERROR_EXIT(REG_ESPACE);
2813
2814   offs = xmalloc(sizeof(int) * parse_ctx.position);
2815   if (offs == NULL)
2816     ERROR_EXIT(REG_ESPACE);
2817
2818   for (i = 0; i < parse_ctx.position; i++)
2819     counts[i] = 0;
2820   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2821
2822   add = 0;
2823   for (i = 0; i < parse_ctx.position; i++)
2824     {
2825       offs[i] = add;
2826       add += counts[i] + 1;
2827       counts[i] = 0;
2828     }
2829   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2830   if (transitions == NULL)
2831     ERROR_EXIT(REG_ESPACE);
2832   tnfa->transitions = transitions;
2833   tnfa->num_transitions = add;
2834
2835   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2836   if (errcode != REG_OK)
2837     ERROR_EXIT(errcode);
2838
2839   tnfa->firstpos_chars = NULL;
2840
2841   p = tree->firstpos;
2842   i = 0;
2843   while (p->position >= 0)
2844     {
2845       i++;
2846       p++;
2847     }
2848
2849   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2850   if (initial == NULL)
2851     ERROR_EXIT(REG_ESPACE);
2852   tnfa->initial = initial;
2853
2854   i = 0;
2855   for (p = tree->firstpos; p->position >= 0; p++)
2856     {
2857       initial[i].state = transitions + offs[p->position];
2858       initial[i].state_id = p->position;
2859       initial[i].tags = NULL;
2860       /* Copy the arrays p->tags, and p->params, they are allocated
2861          from a tre_mem object. */
2862       if (p->tags)
2863         {
2864           int j;
2865           for (j = 0; p->tags[j] >= 0; j++);
2866           initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2867           if (!initial[i].tags)
2868             ERROR_EXIT(REG_ESPACE);
2869           memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2870         }
2871       initial[i].assertions = p->assertions;
2872       i++;
2873     }
2874   initial[i].state = NULL;
2875
2876   tnfa->num_transitions = add;
2877   tnfa->final = transitions + offs[tree->lastpos[0].position];
2878   tnfa->num_states = parse_ctx.position;
2879   tnfa->cflags = cflags;
2880
2881   tre_mem_destroy(mem);
2882   tre_stack_destroy(stack);
2883   xfree(counts);
2884   xfree(offs);
2885
2886   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2887   return REG_OK;
2888
2889  error_exit:
2890   /* Free everything that was allocated and return the error code. */
2891   tre_mem_destroy(mem);
2892   if (stack != NULL)
2893     tre_stack_destroy(stack);
2894   if (counts != NULL)
2895     xfree(counts);
2896   if (offs != NULL)
2897     xfree(offs);
2898   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2899   regfree(preg);
2900   return errcode;
2901 }
2902
2903
2904
2905
2906 void
2907 regfree(regex_t *preg)
2908 {
2909   tre_tnfa_t *tnfa;
2910   unsigned int i;
2911   tre_tnfa_transition_t *trans;
2912
2913   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2914   if (!tnfa)
2915     return;
2916
2917   for (i = 0; i < tnfa->num_transitions; i++)
2918     if (tnfa->transitions[i].state)
2919       {
2920         if (tnfa->transitions[i].tags)
2921           xfree(tnfa->transitions[i].tags);
2922         if (tnfa->transitions[i].neg_classes)
2923           xfree(tnfa->transitions[i].neg_classes);
2924       }
2925   if (tnfa->transitions)
2926     xfree(tnfa->transitions);
2927
2928   if (tnfa->initial)
2929     {
2930       for (trans = tnfa->initial; trans->state; trans++)
2931         {
2932           if (trans->tags)
2933             xfree(trans->tags);
2934         }
2935       xfree(tnfa->initial);
2936     }
2937
2938   if (tnfa->submatch_data)
2939     {
2940       for (i = 0; i < tnfa->num_submatches; i++)
2941         if (tnfa->submatch_data[i].parents)
2942           xfree(tnfa->submatch_data[i].parents);
2943       xfree(tnfa->submatch_data);
2944     }
2945
2946   if (tnfa->tag_directions)
2947     xfree(tnfa->tag_directions);
2948   if (tnfa->firstpos_chars)
2949     xfree(tnfa->firstpos_chars);
2950   if (tnfa->minimal_tags)
2951     xfree(tnfa->minimal_tags);
2952   xfree(tnfa);
2953 }